mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
f552f2b1e8 | ||
![]() |
26b8927632 | ||
![]() |
db76d3865f |
BIN
.RELEASE_NOTES.swp
Normal file
BIN
.RELEASE_NOTES.swp
Normal file
Binary file not shown.
@@ -22,7 +22,7 @@ required.
|
|||||||
Compile Instructions
|
Compile Instructions
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
|
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
|
||||||
|
|
||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
@@ -65,6 +65,28 @@ If not what makes it happen or not happen?
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v3.19.9
|
||||||
|
|
||||||
|
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
|
||||||
|
Relaxed some excessively strict data alignment that was negatively affecting performance.
|
||||||
|
|
||||||
|
v3.19.8
|
||||||
|
|
||||||
|
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
|
||||||
|
url protocol specifier for requesting a secure stratum connection.
|
||||||
|
|
||||||
|
The full url, including the protocol, is now displayed in the stratum connect
|
||||||
|
log and the periodic summary log.
|
||||||
|
|
||||||
|
Small optimizations to Cubehash, AVX2 & AVX512.
|
||||||
|
|
||||||
|
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
|
||||||
|
|
||||||
|
v3.19.7
|
||||||
|
|
||||||
|
#369 Fixed time limited mining, --time-limit.
|
||||||
|
Fixed a potential compile error when using optimization below -O3.
|
||||||
|
|
||||||
v3.19.6
|
v3.19.6
|
||||||
|
|
||||||
#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing
|
#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing
|
||||||
@@ -76,7 +98,7 @@ v3.19.5
|
|||||||
Enhanced stratum-keepalive preemptively resets the stratum connection
|
Enhanced stratum-keepalive preemptively resets the stratum connection
|
||||||
before the server to avoid lost shares.
|
before the server to avoid lost shares.
|
||||||
|
|
||||||
Added build-msys2.sh scrypt for easier compiling on Windows, see Wiki for details.
|
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
|
||||||
|
|
||||||
X16RT: eliminate unnecessary recalculations of the hash order.
|
X16RT: eliminate unnecessary recalculations of the hash order.
|
||||||
|
|
||||||
|
@@ -49,6 +49,20 @@ extern "C"{
|
|||||||
|
|
||||||
#define SPH_SIZE_blake512 512
|
#define SPH_SIZE_blake512 512
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
//
|
||||||
|
// Blake-256 1 way SSE2
|
||||||
|
|
||||||
|
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||||
|
const uint32_t T0, const uint32_t T1 );
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
//
|
||||||
|
// Blake-512 1 way SSE2
|
||||||
|
|
||||||
|
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
|
||||||
|
const uint64_t T0, const uint64_t T1 );
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
//
|
//
|
||||||
// Blake-256 4 way SSE2
|
// Blake-256 4 way SSE2
|
||||||
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
|
|||||||
void blake256_8way_init(void *cc);
|
void blake256_8way_init(void *cc);
|
||||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||||
void blake256_8way_close(void *cc, void *dst);
|
void blake256_8way_close(void *cc, void *dst);
|
||||||
|
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
||||||
|
void blake256_8way_close_le(void *cc, void *dst);
|
||||||
|
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
|
const void *data );
|
||||||
|
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
|
const void *midhash, const void *data );
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_8way_small_context blake256r14_8way_context;
|
typedef blake_8way_small_context blake256r14_8way_context;
|
||||||
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
|
|||||||
void blake512_4way_close( void *cc, void *dst );
|
void blake512_4way_close( void *cc, void *dst );
|
||||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||||
const void *data, size_t len );
|
const void *data, size_t len );
|
||||||
|
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||||
|
const void *data, size_t len );
|
||||||
|
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||||
|
const void *data );
|
||||||
|
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||||
|
const __m256i nonce, const __m256i *midstate );
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
|
|||||||
void blake256_16way_init(void *cc);
|
void blake256_16way_init(void *cc);
|
||||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||||
void blake256_16way_close(void *cc, void *dst);
|
void blake256_16way_close(void *cc, void *dst);
|
||||||
|
// Expects data in little endian order, no byte swap needed
|
||||||
|
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
||||||
|
void blake256_16way_close_le(void *cc, void *dst);
|
||||||
|
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
|
const void *data );
|
||||||
|
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
|
const void *midhash, const void *data );
|
||||||
|
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_16way_small_context blake256r14_16way_context;
|
typedef blake_16way_small_context blake256r14_16way_context;
|
||||||
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
|
|||||||
void blake512_8way_close( void *cc, void *dst );
|
void blake512_8way_close( void *cc, void *dst );
|
||||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||||
const void *data, size_t len );
|
const void *data, size_t len );
|
||||||
void blake512_8way_hash_le80( void *hash, const void *data );
|
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||||
|
const void *data, size_t len );
|
||||||
|
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||||
|
const void *data );
|
||||||
|
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||||
|
const __m512i nonce, const __m512i *midstate );
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
@@ -5,6 +5,7 @@
|
|||||||
* ==========================(LICENSE BEGIN)============================
|
* ==========================(LICENSE BEGIN)============================
|
||||||
*
|
*
|
||||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||||
|
* 2016-2022 JayDDee246@gmail.com
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining
|
* Permission is hereby granted, free of charge, to any person obtaining
|
||||||
* a copy of this software and associated documentation files (the
|
* a copy of this software and associated documentation files (the
|
||||||
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Blake-256 1 way SIMD
|
||||||
|
|
||||||
|
#define BLAKE256_ROUND( r ) \
|
||||||
|
{ \
|
||||||
|
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||||
|
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||||
|
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||||
|
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||||
|
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||||
|
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
|
||||||
|
V2 = _mm_add_epi32( V2, V3 ); \
|
||||||
|
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||||
|
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||||
|
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||||
|
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||||
|
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||||
|
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||||
|
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
|
||||||
|
V2 = _mm_add_epi32( V2, V3 ); \
|
||||||
|
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||||
|
V3 = mm128_shufll_32( V3 ); \
|
||||||
|
V2 = mm128_swap_64( V2 ); \
|
||||||
|
V1 = mm128_shuflr_32( V1 ); \
|
||||||
|
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||||
|
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
|
||||||
|
CSx( r, D ) ^ Mx( r, C ), \
|
||||||
|
CSx( r, B ) ^ Mx( r, A ), \
|
||||||
|
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
|
||||||
|
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
|
||||||
|
V2 = _mm_add_epi32( V2, V3 ); \
|
||||||
|
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||||
|
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||||
|
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
|
||||||
|
CSx( r, C ) ^ Mx( r, D ), \
|
||||||
|
CSx( r, A ) ^ Mx( r, B ), \
|
||||||
|
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
|
||||||
|
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
|
||||||
|
V2 = _mm_add_epi32( V2, V3 ); \
|
||||||
|
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||||
|
V3 = mm128_shuflr_32( V3 ); \
|
||||||
|
V2 = mm128_swap_64( V2 ); \
|
||||||
|
V1 = mm128_shufll_32( V1 ); \
|
||||||
|
}
|
||||||
|
|
||||||
|
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||||
|
const uint32_t T0, const uint32_t T1 )
|
||||||
|
{
|
||||||
|
__m128i V0, V1, V2, V3;
|
||||||
|
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
V0 = casti_m128i( H, 0 );
|
||||||
|
V1 = casti_m128i( H, 1 );
|
||||||
|
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||||
|
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||||
|
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
|
||||||
|
M0 = buf[ 0];
|
||||||
|
M1 = buf[ 1];
|
||||||
|
M2 = buf[ 2];
|
||||||
|
M3 = buf[ 3];
|
||||||
|
M4 = buf[ 4];
|
||||||
|
M5 = buf[ 5];
|
||||||
|
M6 = buf[ 6];
|
||||||
|
M7 = buf[ 7];
|
||||||
|
M8 = buf[ 8];
|
||||||
|
M9 = buf[ 9];
|
||||||
|
MA = buf[10];
|
||||||
|
MB = buf[11];
|
||||||
|
MC = buf[12];
|
||||||
|
MD = buf[13];
|
||||||
|
ME = buf[14];
|
||||||
|
MF = buf[15];
|
||||||
|
BLAKE256_ROUND( 0 );
|
||||||
|
BLAKE256_ROUND( 1 );
|
||||||
|
BLAKE256_ROUND( 2 );
|
||||||
|
BLAKE256_ROUND( 3 );
|
||||||
|
BLAKE256_ROUND( 4 );
|
||||||
|
BLAKE256_ROUND( 5 );
|
||||||
|
BLAKE256_ROUND( 6 );
|
||||||
|
BLAKE256_ROUND( 7 );
|
||||||
|
BLAKE256_ROUND( 8 );
|
||||||
|
BLAKE256_ROUND( 9 );
|
||||||
|
BLAKE256_ROUND( 0 );
|
||||||
|
BLAKE256_ROUND( 1 );
|
||||||
|
BLAKE256_ROUND( 2 );
|
||||||
|
BLAKE256_ROUND( 3 );
|
||||||
|
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
|
||||||
|
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
//
|
||||||
// Blake-256 4 way
|
// Blake-256 4 way
|
||||||
|
|
||||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||||
@@ -508,14 +601,10 @@ do { \
|
|||||||
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
|
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
|
||||||
VA = m128_const1_64( 0x13198A2E13198A2E ); \
|
VA = m128_const1_64( 0x13198A2E13198A2E ); \
|
||||||
VB = m128_const1_64( 0x0370734403707344 ); \
|
VB = m128_const1_64( 0x0370734403707344 ); \
|
||||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
|
||||||
m128_const1_64( 0xA4093822A4093822 ) ); \
|
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
|
||||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
|
||||||
m128_const1_64( 0x299F31D0299F31D0 ) ); \
|
VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \
|
||||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
|
|
||||||
m128_const1_64( 0x082EFA98082EFA98 ) ); \
|
|
||||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
|
|
||||||
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
|
||||||
BLAKE256_4WAY_BLOCK_BSWAP32; \
|
BLAKE256_4WAY_BLOCK_BSWAP32; \
|
||||||
ROUND_S_4WAY(0); \
|
ROUND_S_4WAY(0); \
|
||||||
ROUND_S_4WAY(1); \
|
ROUND_S_4WAY(1); \
|
||||||
@@ -548,6 +637,8 @@ do { \
|
|||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
//
|
||||||
// Blake-256 8 way
|
// Blake-256 8 way
|
||||||
|
|
||||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||||
@@ -626,14 +717,10 @@ do { \
|
|||||||
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
||||||
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
||||||
VB = m256_const1_64( 0x0370734403707344 ); \
|
VB = m256_const1_64( 0x0370734403707344 ); \
|
||||||
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
|
||||||
m256_const1_64( 0xA4093822A4093822 ) ); \
|
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
|
||||||
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
|
||||||
m256_const1_64( 0x299F31D0299F31D0 ) ); \
|
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
|
||||||
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
|
||||||
m256_const1_64( 0x082EFA98082EFA98 ) ); \
|
|
||||||
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
|
||||||
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
|
||||||
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||||
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||||
@@ -679,13 +766,247 @@ do { \
|
|||||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define COMPRESS32_8WAY_LE( rounds ) \
|
||||||
|
do { \
|
||||||
|
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||||
|
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||||
|
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||||
|
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||||
|
V0 = H0; \
|
||||||
|
V1 = H1; \
|
||||||
|
V2 = H2; \
|
||||||
|
V3 = H3; \
|
||||||
|
V4 = H4; \
|
||||||
|
V5 = H5; \
|
||||||
|
V6 = H6; \
|
||||||
|
V7 = H7; \
|
||||||
|
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
|
||||||
|
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
||||||
|
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
||||||
|
VB = m256_const1_64( 0x0370734403707344 ); \
|
||||||
|
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
|
||||||
|
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
|
||||||
|
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
|
||||||
|
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
|
||||||
|
M0 = buf[ 0]; \
|
||||||
|
M1 = buf[ 1]; \
|
||||||
|
M2 = buf[ 2]; \
|
||||||
|
M3 = buf[ 3]; \
|
||||||
|
M4 = buf[ 4]; \
|
||||||
|
M5 = buf[ 5]; \
|
||||||
|
M6 = buf[ 6]; \
|
||||||
|
M7 = buf[ 7]; \
|
||||||
|
M8 = buf[ 8]; \
|
||||||
|
M9 = buf[ 9]; \
|
||||||
|
MA = buf[10]; \
|
||||||
|
MB = buf[11]; \
|
||||||
|
MC = buf[12]; \
|
||||||
|
MD = buf[13]; \
|
||||||
|
ME = buf[14]; \
|
||||||
|
MF = buf[15]; \
|
||||||
|
ROUND_S_8WAY(0); \
|
||||||
|
ROUND_S_8WAY(1); \
|
||||||
|
ROUND_S_8WAY(2); \
|
||||||
|
ROUND_S_8WAY(3); \
|
||||||
|
ROUND_S_8WAY(4); \
|
||||||
|
ROUND_S_8WAY(5); \
|
||||||
|
ROUND_S_8WAY(6); \
|
||||||
|
ROUND_S_8WAY(7); \
|
||||||
|
if (rounds == 14) \
|
||||||
|
{ \
|
||||||
|
ROUND_S_8WAY(8); \
|
||||||
|
ROUND_S_8WAY(9); \
|
||||||
|
ROUND_S_8WAY(0); \
|
||||||
|
ROUND_S_8WAY(1); \
|
||||||
|
ROUND_S_8WAY(2); \
|
||||||
|
ROUND_S_8WAY(3); \
|
||||||
|
} \
|
||||||
|
H0 = mm256_xor3( V8, V0, H0 ); \
|
||||||
|
H1 = mm256_xor3( V9, V1, H1 ); \
|
||||||
|
H2 = mm256_xor3( VA, V2, H2 ); \
|
||||||
|
H3 = mm256_xor3( VB, V3, H3 ); \
|
||||||
|
H4 = mm256_xor3( VC, V4, H4 ); \
|
||||||
|
H5 = mm256_xor3( VD, V5, H5 ); \
|
||||||
|
H6 = mm256_xor3( VE, V6, H6 ); \
|
||||||
|
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
|
const void *data )
|
||||||
|
{
|
||||||
|
const __m256i *M = (const __m256i*)data;
|
||||||
|
__m256i *V = (__m256i*)midstate;
|
||||||
|
const __m256i *H = (const __m256i*)midhash;
|
||||||
|
|
||||||
|
V[ 0] = H[0];
|
||||||
|
V[ 1] = H[1];
|
||||||
|
V[ 2] = H[2];
|
||||||
|
V[ 3] = H[3];
|
||||||
|
V[ 4] = H[4];
|
||||||
|
V[ 5] = H[5];
|
||||||
|
V[ 6] = H[6];
|
||||||
|
V[ 7] = H[7];
|
||||||
|
V[ 8] = m256_const1_32( CS0 );
|
||||||
|
V[ 9] = m256_const1_32( CS1 );
|
||||||
|
V[10] = m256_const1_32( CS2 );
|
||||||
|
V[11] = m256_const1_32( CS3 );
|
||||||
|
V[12] = m256_const1_32( CS4 ^ 0x280 );
|
||||||
|
V[13] = m256_const1_32( CS5 ^ 0x280 );
|
||||||
|
V[14] = m256_const1_32( CS6 );
|
||||||
|
V[15] = m256_const1_32( CS7 );
|
||||||
|
|
||||||
|
// G0
|
||||||
|
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
|
||||||
|
|
||||||
|
// G1
|
||||||
|
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
|
||||||
|
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
|
||||||
|
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
|
||||||
|
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
|
||||||
|
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
|
||||||
|
|
||||||
|
// G2,G3
|
||||||
|
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||||
|
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||||
|
|
||||||
|
// G4
|
||||||
|
V[ 0] = _mm256_add_epi32( V[ 0],
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
|
||||||
|
|
||||||
|
// G6
|
||||||
|
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
|
||||||
|
|
||||||
|
// G7
|
||||||
|
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
|
||||||
|
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
|
||||||
|
V[ 3] = _mm256_add_epi32( V[ 3],
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
|
const void *midhash, const void *data )
|
||||||
|
{
|
||||||
|
__m256i *H = (__m256i*)final_hash;
|
||||||
|
const __m256i *h = (const __m256i*)midhash;
|
||||||
|
const __m256i *v= (const __m256i*)midstate;
|
||||||
|
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
|
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
|
||||||
|
V0 = v[ 0];
|
||||||
|
V1 = v[ 1];
|
||||||
|
V2 = v[ 2];
|
||||||
|
V3 = v[ 3];
|
||||||
|
V4 = v[ 4];
|
||||||
|
V5 = v[ 5];
|
||||||
|
V6 = v[ 6];
|
||||||
|
V7 = v[ 7];
|
||||||
|
V8 = v[ 8];
|
||||||
|
V9 = v[ 9];
|
||||||
|
VA = v[10];
|
||||||
|
VB = v[11];
|
||||||
|
VC = v[12];
|
||||||
|
VD = v[13];
|
||||||
|
VE = v[14];
|
||||||
|
VF = v[15];
|
||||||
|
|
||||||
|
M0 = casti_m256i( data, 0 );
|
||||||
|
M1 = casti_m256i( data, 1 );
|
||||||
|
M2 = casti_m256i( data, 2 );
|
||||||
|
M3 = casti_m256i( data, 3 );
|
||||||
|
M4 = casti_m256i( data, 4 );
|
||||||
|
M5 = casti_m256i( data, 5 );
|
||||||
|
M6 = casti_m256i( data, 6 );
|
||||||
|
M7 = casti_m256i( data, 7 );
|
||||||
|
M8 = casti_m256i( data, 8 );
|
||||||
|
M9 = casti_m256i( data, 9 );
|
||||||
|
MA = casti_m256i( data, 10 );
|
||||||
|
MB = casti_m256i( data, 11 );
|
||||||
|
MC = casti_m256i( data, 12 );
|
||||||
|
MD = casti_m256i( data, 13 );
|
||||||
|
ME = casti_m256i( data, 14 );
|
||||||
|
MF = casti_m256i( data, 15 );
|
||||||
|
|
||||||
|
// Finish round 0
|
||||||
|
// G1
|
||||||
|
V1 = _mm256_add_epi32( V1,
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
|
||||||
|
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
|
||||||
|
V9 = _mm256_add_epi32( V9, VD );
|
||||||
|
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
|
||||||
|
|
||||||
|
// G4
|
||||||
|
V0 = _mm256_add_epi32( V0, V5 );
|
||||||
|
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
|
||||||
|
VA = _mm256_add_epi32( VA, VF );
|
||||||
|
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
|
||||||
|
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
|
||||||
|
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
|
||||||
|
VA = _mm256_add_epi32( VA, VF );
|
||||||
|
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
|
||||||
|
|
||||||
|
// G5
|
||||||
|
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||||
|
|
||||||
|
// G6
|
||||||
|
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
|
||||||
|
V8 = _mm256_add_epi32( V8, VD );
|
||||||
|
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
|
||||||
|
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
|
||||||
|
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
|
||||||
|
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
|
||||||
|
V8 = _mm256_add_epi32( V8, VD );
|
||||||
|
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
|
||||||
|
|
||||||
|
// G7
|
||||||
|
V9 = _mm256_add_epi32( V9, VE );
|
||||||
|
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
|
||||||
|
V3 = _mm256_add_epi32( V3, V4 );
|
||||||
|
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
|
||||||
|
V9 = _mm256_add_epi32( V9, VE );
|
||||||
|
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
|
||||||
|
|
||||||
|
// Remaining rounds
|
||||||
|
ROUND_S_8WAY( 1 );
|
||||||
|
ROUND_S_8WAY( 2 );
|
||||||
|
ROUND_S_8WAY( 3 );
|
||||||
|
ROUND_S_8WAY( 4 );
|
||||||
|
ROUND_S_8WAY( 5 );
|
||||||
|
ROUND_S_8WAY( 6 );
|
||||||
|
ROUND_S_8WAY( 7 );
|
||||||
|
ROUND_S_8WAY( 8 );
|
||||||
|
ROUND_S_8WAY( 9 );
|
||||||
|
ROUND_S_8WAY( 0 );
|
||||||
|
ROUND_S_8WAY( 1 );
|
||||||
|
ROUND_S_8WAY( 2 );
|
||||||
|
ROUND_S_8WAY( 3 );
|
||||||
|
|
||||||
|
const __m256i shuf_bswap32 =
|
||||||
|
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||||
|
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
|
|
||||||
|
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||||
|
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||||
|
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||||
|
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||||
|
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||||
|
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||||
|
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||||
|
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
// Blaske-256 16 way AVX512
|
///////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Blake-256 16 way AVX512
|
||||||
|
|
||||||
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
|
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||||
do { \
|
do { \
|
||||||
@@ -763,14 +1084,10 @@ do { \
|
|||||||
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
||||||
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
||||||
VB = m512_const1_64( 0x0370734403707344 ); \
|
VB = m512_const1_64( 0x0370734403707344 ); \
|
||||||
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
|
||||||
m512_const1_64( 0xA4093822A4093822 ) ); \
|
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
|
||||||
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
|
||||||
m512_const1_64( 0x299F31D0299F31D0 ) ); \
|
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
|
||||||
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
|
||||||
m512_const1_64( 0x082EFA98082EFA98 ) ); \
|
|
||||||
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
|
||||||
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
|
||||||
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||||
@@ -818,6 +1135,264 @@ do { \
|
|||||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define COMPRESS32_16WAY_LE( rounds ) \
|
||||||
|
do { \
|
||||||
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||||
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||||
|
V0 = H0; \
|
||||||
|
V1 = H1; \
|
||||||
|
V2 = H2; \
|
||||||
|
V3 = H3; \
|
||||||
|
V4 = H4; \
|
||||||
|
V5 = H5; \
|
||||||
|
V6 = H6; \
|
||||||
|
V7 = H7; \
|
||||||
|
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
|
||||||
|
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
||||||
|
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
||||||
|
VB = m512_const1_64( 0x0370734403707344 ); \
|
||||||
|
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
|
||||||
|
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
|
||||||
|
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
|
||||||
|
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
|
||||||
|
M0 = buf[ 0]; \
|
||||||
|
M1 = buf[ 1]; \
|
||||||
|
M2 = buf[ 2]; \
|
||||||
|
M3 = buf[ 3]; \
|
||||||
|
M4 = buf[ 4]; \
|
||||||
|
M5 = buf[ 5]; \
|
||||||
|
M6 = buf[ 6]; \
|
||||||
|
M7 = buf[ 7]; \
|
||||||
|
M8 = buf[ 8]; \
|
||||||
|
M9 = buf[ 9]; \
|
||||||
|
MA = buf[10]; \
|
||||||
|
MB = buf[11]; \
|
||||||
|
MC = buf[12]; \
|
||||||
|
MD = buf[13]; \
|
||||||
|
ME = buf[14]; \
|
||||||
|
MF = buf[15]; \
|
||||||
|
ROUND_S_16WAY(0); \
|
||||||
|
ROUND_S_16WAY(1); \
|
||||||
|
ROUND_S_16WAY(2); \
|
||||||
|
ROUND_S_16WAY(3); \
|
||||||
|
ROUND_S_16WAY(4); \
|
||||||
|
ROUND_S_16WAY(5); \
|
||||||
|
ROUND_S_16WAY(6); \
|
||||||
|
ROUND_S_16WAY(7); \
|
||||||
|
if (rounds == 14) \
|
||||||
|
{ \
|
||||||
|
ROUND_S_16WAY(8); \
|
||||||
|
ROUND_S_16WAY(9); \
|
||||||
|
ROUND_S_16WAY(0); \
|
||||||
|
ROUND_S_16WAY(1); \
|
||||||
|
ROUND_S_16WAY(2); \
|
||||||
|
ROUND_S_16WAY(3); \
|
||||||
|
} \
|
||||||
|
H0 = mm512_xor3( V8, V0, H0 ); \
|
||||||
|
H1 = mm512_xor3( V9, V1, H1 ); \
|
||||||
|
H2 = mm512_xor3( VA, V2, H2 ); \
|
||||||
|
H3 = mm512_xor3( VB, V3, H3 ); \
|
||||||
|
H4 = mm512_xor3( VC, V4, H4 ); \
|
||||||
|
H5 = mm512_xor3( VD, V5, H5 ); \
|
||||||
|
H6 = mm512_xor3( VE, V6, H6 ); \
|
||||||
|
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
// Blake-256 prehash of the second block is split onto 2 parts. The first part
|
||||||
|
// is constant for every nonce and only needs to be run once per job. The
|
||||||
|
// second part is run for each nonce using the precalculated midstate and the
|
||||||
|
// hash from the first block.
|
||||||
|
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
|
const void *data )
|
||||||
|
{
|
||||||
|
const __m512i *M = (const __m512i*)data;
|
||||||
|
__m512i *V = (__m512i*)midstate;
|
||||||
|
const __m512i *H = (const __m512i*)midhash;
|
||||||
|
|
||||||
|
V[ 0] = H[0];
|
||||||
|
V[ 1] = H[1];
|
||||||
|
V[ 2] = H[2];
|
||||||
|
V[ 3] = H[3];
|
||||||
|
V[ 4] = H[4];
|
||||||
|
V[ 5] = H[5];
|
||||||
|
V[ 6] = H[6];
|
||||||
|
V[ 7] = H[7];
|
||||||
|
V[ 8] = m512_const1_32( CS0 );
|
||||||
|
V[ 9] = m512_const1_32( CS1 );
|
||||||
|
V[10] = m512_const1_32( CS2 );
|
||||||
|
V[11] = m512_const1_32( CS3 );
|
||||||
|
V[12] = m512_const1_32( CS4 ^ 0x280 );
|
||||||
|
V[13] = m512_const1_32( CS5 ^ 0x280 );
|
||||||
|
V[14] = m512_const1_32( CS6 );
|
||||||
|
V[15] = m512_const1_32( CS7 );
|
||||||
|
|
||||||
|
// G0
|
||||||
|
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
|
||||||
|
|
||||||
|
// G1, nonce is in M[3]
|
||||||
|
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
|
||||||
|
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
|
||||||
|
V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
|
||||||
|
V[ 9] = _mm512_add_epi32( V[ 9], V[13] );
|
||||||
|
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
|
||||||
|
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
|
||||||
|
|
||||||
|
// G2,G3
|
||||||
|
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||||
|
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||||
|
|
||||||
|
// G4
|
||||||
|
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
|
||||||
|
V[ 0] = _mm512_add_epi32( V[ 0],
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
|
||||||
|
|
||||||
|
// G5
|
||||||
|
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
|
||||||
|
|
||||||
|
// G6
|
||||||
|
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
|
||||||
|
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
|
||||||
|
// G7
|
||||||
|
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
|
||||||
|
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
|
||||||
|
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
|
||||||
|
V[ 3] = _mm512_add_epi32( V[ 3],
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
|
const void *midhash, const void *data )
|
||||||
|
{
|
||||||
|
__m512i *H = (__m512i*)final_hash;
|
||||||
|
const __m512i *h = (const __m512i*)midhash;
|
||||||
|
const __m512i *v= (const __m512i*)midstate;
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
|
||||||
|
V0 = v[ 0];
|
||||||
|
V1 = v[ 1];
|
||||||
|
V2 = v[ 2];
|
||||||
|
V3 = v[ 3];
|
||||||
|
V4 = v[ 4];
|
||||||
|
V5 = v[ 5];
|
||||||
|
V6 = v[ 6];
|
||||||
|
V7 = v[ 7];
|
||||||
|
V8 = v[ 8];
|
||||||
|
V9 = v[ 9];
|
||||||
|
VA = v[10];
|
||||||
|
VB = v[11];
|
||||||
|
VC = v[12];
|
||||||
|
VD = v[13];
|
||||||
|
VE = v[14];
|
||||||
|
VF = v[15];
|
||||||
|
|
||||||
|
M0 = casti_m512i( data, 0 );
|
||||||
|
M1 = casti_m512i( data, 1 );
|
||||||
|
M2 = casti_m512i( data, 2 );
|
||||||
|
M3 = casti_m512i( data, 3 );
|
||||||
|
M4 = casti_m512i( data, 4 );
|
||||||
|
M5 = casti_m512i( data, 5 );
|
||||||
|
M6 = casti_m512i( data, 6 );
|
||||||
|
M7 = casti_m512i( data, 7 );
|
||||||
|
M8 = casti_m512i( data, 8 );
|
||||||
|
M9 = casti_m512i( data, 9 );
|
||||||
|
MA = casti_m512i( data, 10 );
|
||||||
|
MB = casti_m512i( data, 11 );
|
||||||
|
MC = casti_m512i( data, 12 );
|
||||||
|
MD = casti_m512i( data, 13 );
|
||||||
|
ME = casti_m512i( data, 14 );
|
||||||
|
MF = casti_m512i( data, 15 );
|
||||||
|
|
||||||
|
// Finish round 0 with the nonce (M3) now available
|
||||||
|
// G0
|
||||||
|
// GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );
|
||||||
|
|
||||||
|
// G1
|
||||||
|
// GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
|
||||||
|
V1 = _mm512_add_epi32( V1,
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
|
||||||
|
VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
|
||||||
|
V9 = _mm512_add_epi32( V9, VD );
|
||||||
|
V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 );
|
||||||
|
|
||||||
|
// G2,G3
|
||||||
|
// GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE );
|
||||||
|
// GS_16WAY( M6, M7, CS6, CS7, V3, V7, VB, VF );
|
||||||
|
|
||||||
|
// G4
|
||||||
|
// GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
|
||||||
|
V0 = _mm512_add_epi32( V0, V5 );
|
||||||
|
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
|
||||||
|
VA = _mm512_add_epi32( VA, VF );
|
||||||
|
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
|
||||||
|
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
|
||||||
|
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
|
||||||
|
VA = _mm512_add_epi32( VA, VF );
|
||||||
|
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
|
||||||
|
|
||||||
|
// G5
|
||||||
|
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||||
|
|
||||||
|
// G6
|
||||||
|
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
|
||||||
|
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
|
||||||
|
V8 = _mm512_add_epi32( V8, VD );
|
||||||
|
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
|
||||||
|
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
|
||||||
|
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
|
||||||
|
V8 = _mm512_add_epi32( V8, VD );
|
||||||
|
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
|
||||||
|
|
||||||
|
// G7
|
||||||
|
// GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
|
||||||
|
V9 = _mm512_add_epi32( V9, VE );
|
||||||
|
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
|
||||||
|
V3 = _mm512_add_epi32( V3, V4 );
|
||||||
|
VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
|
||||||
|
V9 = _mm512_add_epi32( V9, VE );
|
||||||
|
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
|
||||||
|
|
||||||
|
// Remaining rounds
|
||||||
|
ROUND_S_16WAY( 1 );
|
||||||
|
ROUND_S_16WAY( 2 );
|
||||||
|
ROUND_S_16WAY( 3 );
|
||||||
|
ROUND_S_16WAY( 4 );
|
||||||
|
ROUND_S_16WAY( 5 );
|
||||||
|
ROUND_S_16WAY( 6 );
|
||||||
|
ROUND_S_16WAY( 7 );
|
||||||
|
ROUND_S_16WAY( 8 );
|
||||||
|
ROUND_S_16WAY( 9 );
|
||||||
|
ROUND_S_16WAY( 0 );
|
||||||
|
ROUND_S_16WAY( 1 );
|
||||||
|
ROUND_S_16WAY( 2 );
|
||||||
|
ROUND_S_16WAY( 3 );
|
||||||
|
|
||||||
|
// Byte swap final hash
|
||||||
|
const __m512i shuf_bswap32 =
|
||||||
|
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
|
||||||
|
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||||
|
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||||
|
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
|
|
||||||
|
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||||
|
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||||
|
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||||
|
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||||
|
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||||
|
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||||
|
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||||
|
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Blake-256 4 way
|
// Blake-256 4 way
|
||||||
@@ -913,8 +1488,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
|||||||
memset_zero_128( buf + vptr + 1, 13 - vptr );
|
memset_zero_128( buf + vptr + 1, 13 - vptr );
|
||||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
||||||
m128_const1_64( 0x0100000001000000ULL ) );
|
m128_const1_64( 0x0100000001000000ULL ) );
|
||||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
|
||||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
|
||||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -926,8 +1501,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
|||||||
memset_zero_128( buf, 56>>2 );
|
memset_zero_128( buf, 56>>2 );
|
||||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
||||||
m128_const1_64( 0x0100000001000000ULL ) );
|
m128_const1_64( 0x0100000001000000ULL ) );
|
||||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
|
||||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
|
||||||
blake32_4way( ctx, buf, 64 );
|
blake32_4way( ctx, buf, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1033,22 +1608,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
if ( out_size_w32 == 8 )
|
if ( out_size_w32 == 8 )
|
||||||
buf[52>>2] = _mm256_or_si256( buf[52>>2],
|
buf[52>>2] = _mm256_or_si256( buf[52>>2],
|
||||||
m256_const1_64( 0x0100000001000000ULL ) );
|
m256_const1_64( 0x0100000001000000ULL ) );
|
||||||
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
|
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
|
||||||
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
|
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
|
||||||
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||||
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||||
memset_zero_256( buf, 56>>2 );
|
memset_zero_256( buf, 56>>2 );
|
||||||
if ( out_size_w32 == 8 )
|
if ( out_size_w32 == 8 )
|
||||||
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
|
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
|
||||||
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
|
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
|
||||||
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
|
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
|
||||||
blake32_8way( sc, buf, 64 );
|
blake32_8way( sc, buf, 64 );
|
||||||
|
}
|
||||||
|
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
__m256i *vdata = (__m256i*)data;
|
||||||
|
__m256i *buf;
|
||||||
|
size_t ptr;
|
||||||
|
const int buf_size = 64; // number of elements, sizeof/4
|
||||||
|
DECL_STATE32_8WAY
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
if ( len < buf_size - ptr )
|
||||||
|
{
|
||||||
|
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
|
||||||
|
ptr += len;
|
||||||
|
sc->ptr = ptr;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
READ_STATE32_8WAY(sc);
|
||||||
|
while ( len > 0 )
|
||||||
|
{
|
||||||
|
size_t clen;
|
||||||
|
|
||||||
|
clen = buf_size - ptr;
|
||||||
|
if (clen > len)
|
||||||
|
clen = len;
|
||||||
|
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
|
||||||
|
ptr += clen;
|
||||||
|
vdata += (clen>>2);
|
||||||
|
len -= clen;
|
||||||
|
if ( ptr == buf_size )
|
||||||
|
{
|
||||||
|
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||||
|
T1 = SPH_T32(T1 + 1);
|
||||||
|
COMPRESS32_8WAY_LE( sc->rounds );
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WRITE_STATE32_8WAY(sc);
|
||||||
|
sc->ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||||
|
void *dst, size_t out_size_w32 )
|
||||||
|
{
|
||||||
|
__m256i buf[16];
|
||||||
|
size_t ptr;
|
||||||
|
unsigned bit_len;
|
||||||
|
sph_u32 th, tl;
|
||||||
|
|
||||||
|
ptr = sc->ptr;
|
||||||
|
bit_len = ((unsigned)ptr << 3);
|
||||||
|
buf[ptr>>2] = m256_const1_32( 0x80000000 );
|
||||||
|
tl = sc->T0 + bit_len;
|
||||||
|
th = sc->T1;
|
||||||
|
|
||||||
|
if ( ptr == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||||
|
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||||
|
}
|
||||||
|
else if ( sc->T0 == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||||
|
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
sc->T0 -= 512 - bit_len;
|
||||||
|
|
||||||
|
if ( ptr <= 52 )
|
||||||
|
{
|
||||||
|
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||||
|
if ( out_size_w32 == 8 )
|
||||||
|
buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 );
|
||||||
|
*(buf+(56>>2)) = _mm256_set1_epi32( th );
|
||||||
|
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
|
||||||
|
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||||
|
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||||
|
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||||
|
memset_zero_256( buf, 56>>2 );
|
||||||
|
if ( out_size_w32 == 8 )
|
||||||
|
buf[52>>2] = m256_one_32;
|
||||||
|
*(buf+(56>>2)) = _mm256_set1_epi32( th );
|
||||||
|
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
|
||||||
|
blake32_8way_le( sc, buf, 64 );
|
||||||
}
|
}
|
||||||
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||||
}
|
}
|
||||||
@@ -1117,7 +1787,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
|||||||
WRITE_STATE32_16WAY(sc);
|
WRITE_STATE32_16WAY(sc);
|
||||||
sc->ptr = ptr;
|
sc->ptr = ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32 )
|
void *dst, size_t out_size_w32 )
|
||||||
@@ -1152,22 +1821,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
if ( out_size_w32 == 8 )
|
if ( out_size_w32 == 8 )
|
||||||
buf[52>>2] = _mm512_or_si512( buf[52>>2],
|
buf[52>>2] = _mm512_or_si512( buf[52>>2],
|
||||||
m512_const1_64( 0x0100000001000000ULL ) );
|
m512_const1_64( 0x0100000001000000ULL ) );
|
||||||
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
|
||||||
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
|
||||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
sc->T0 = 0xFFFFFE00UL;
|
||||||
|
sc->T1 = 0xFFFFFFFFUL;
|
||||||
|
memset_zero_512( buf, 56>>2 );
|
||||||
|
if ( out_size_w32 == 8 )
|
||||||
|
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
|
||||||
|
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
|
||||||
|
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
|
||||||
|
blake32_16way( sc, buf, 64 );
|
||||||
|
}
|
||||||
|
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
__m512i *vdata = (__m512i*)data;
|
||||||
|
__m512i *buf;
|
||||||
|
size_t ptr;
|
||||||
|
const int buf_size = 64; // number of elements, sizeof/4
|
||||||
|
DECL_STATE32_16WAY
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
|
||||||
|
// only if calling update with 80
|
||||||
|
if ( len < buf_size - ptr )
|
||||||
|
{
|
||||||
|
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
|
||||||
|
ptr += len;
|
||||||
|
sc->ptr = ptr;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
READ_STATE32_16WAY(sc);
|
||||||
|
while ( len > 0 )
|
||||||
|
{
|
||||||
|
size_t clen;
|
||||||
|
|
||||||
|
clen = buf_size - ptr;
|
||||||
|
if (clen > len)
|
||||||
|
clen = len;
|
||||||
|
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
|
||||||
|
ptr += clen;
|
||||||
|
vdata += (clen>>2);
|
||||||
|
len -= clen;
|
||||||
|
if ( ptr == buf_size )
|
||||||
|
{
|
||||||
|
if ( ( T0 = T0 + 512 ) < 512 )
|
||||||
|
T1 = T1 + 1;
|
||||||
|
COMPRESS32_16WAY_LE( sc->rounds );
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WRITE_STATE32_16WAY(sc);
|
||||||
|
sc->ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||||
|
void *dst, size_t out_size_w32 )
|
||||||
|
{
|
||||||
|
__m512i buf[16];
|
||||||
|
size_t ptr;
|
||||||
|
unsigned bit_len;
|
||||||
|
sph_u32 th, tl;
|
||||||
|
|
||||||
|
ptr = sc->ptr;
|
||||||
|
bit_len = ((unsigned)ptr << 3);
|
||||||
|
buf[ptr>>2] = m512_const1_32( 0x80000000 );
|
||||||
|
tl = sc->T0 + bit_len;
|
||||||
|
th = sc->T1;
|
||||||
|
|
||||||
|
if ( ptr == 0 )
|
||||||
|
{
|
||||||
sc->T0 = 0xFFFFFE00UL;
|
sc->T0 = 0xFFFFFE00UL;
|
||||||
sc->T1 = 0xFFFFFFFFUL;
|
sc->T1 = 0xFFFFFFFFUL;
|
||||||
memset_zero_512( buf, 56>>2 );
|
}
|
||||||
if ( out_size_w32 == 8 )
|
else if ( sc->T0 == 0 )
|
||||||
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
|
{
|
||||||
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||||
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
sc->T1 = sc->T1 - 1;
|
||||||
blake32_16way( sc, buf, 64 );
|
}
|
||||||
|
else
|
||||||
|
sc->T0 -= 512 - bit_len;
|
||||||
|
|
||||||
|
if ( ptr <= 52 )
|
||||||
|
{
|
||||||
|
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||||
|
buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 );
|
||||||
|
buf[56>>2] = _mm512_set1_epi32( th );
|
||||||
|
buf[60>>2] = _mm512_set1_epi32( tl );
|
||||||
|
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||||
|
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
sc->T0 = 0xFFFFFE00UL;
|
||||||
|
sc->T1 = 0xFFFFFFFFUL;
|
||||||
|
memset_zero_512( buf, 56>>2 );
|
||||||
|
buf[52>>2] = m512_one_32;
|
||||||
|
buf[56>>2] = _mm512_set1_epi32( th );
|
||||||
|
buf[60>>2] = _mm512_set1_epi32( tl );
|
||||||
|
blake32_16way_le( sc, buf, 64 );
|
||||||
}
|
}
|
||||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||||
}
|
}
|
||||||
@@ -1190,6 +1953,18 @@ blake256_16way_close(void *cc, void *dst)
|
|||||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_16way_update_le(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_16way_le(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_16way_close_le(void *cc, void *dst)
|
||||||
|
{
|
||||||
|
blake32_16way_close_le(cc, 0, 0, dst, 8);
|
||||||
|
}
|
||||||
|
|
||||||
void blake256r14_16way_init(void *cc)
|
void blake256r14_16way_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||||
@@ -1271,6 +2046,18 @@ blake256_8way_close(void *cc, void *dst)
|
|||||||
blake32_8way_close(cc, 0, 0, dst, 8);
|
blake32_8way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_8way_update_le(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_8way_le(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_8way_close_le(void *cc, void *dst)
|
||||||
|
{
|
||||||
|
blake32_8way_close_le(cc, 0, 0, dst, 8);
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 14 rounds Blake, Decred
|
// 14 rounds Blake, Decred
|
||||||
|
@@ -361,14 +361,10 @@ static const sph_u64 CB[16] = {
|
|||||||
V9 = m512_const1_64( CB1 ); \
|
V9 = m512_const1_64( CB1 ); \
|
||||||
VA = m512_const1_64( CB2 ); \
|
VA = m512_const1_64( CB2 ); \
|
||||||
VB = m512_const1_64( CB3 ); \
|
VB = m512_const1_64( CB3 ); \
|
||||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
|
||||||
m512_const1_64( CB4 ) ); \
|
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
|
||||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
|
||||||
m512_const1_64( CB5 ) ); \
|
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
|
||||||
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
|
||||||
m512_const1_64( CB6 ) ); \
|
|
||||||
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
|
||||||
m512_const1_64( CB7 ) ); \
|
|
||||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||||
@@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
|||||||
V9 = m512_const1_64( CB1 );
|
V9 = m512_const1_64( CB1 );
|
||||||
VA = m512_const1_64( CB2 );
|
VA = m512_const1_64( CB2 );
|
||||||
VB = m512_const1_64( CB3 );
|
VB = m512_const1_64( CB3 );
|
||||||
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||||
m512_const1_64( CB4 ) );
|
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||||
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||||
m512_const1_64( CB5 ) );
|
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||||
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
|
|
||||||
m512_const1_64( CB6 ) );
|
|
||||||
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
|
|
||||||
m512_const1_64( CB7 ) );
|
|
||||||
|
|
||||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
|
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
|
||||||
0x28292a2b2c2d2e2f, 0x2021222324252627,
|
0x28292a2b2c2d2e2f, 0x2021222324252627,
|
||||||
@@ -493,6 +485,308 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
|||||||
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
|
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// won't be used after prehash implemented
|
||||||
|
void blake512_8way_compress_le( blake_8way_big_context *sc )
|
||||||
|
{
|
||||||
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
|
||||||
|
V0 = sc->H[0];
|
||||||
|
V1 = sc->H[1];
|
||||||
|
V2 = sc->H[2];
|
||||||
|
V3 = sc->H[3];
|
||||||
|
V4 = sc->H[4];
|
||||||
|
V5 = sc->H[5];
|
||||||
|
V6 = sc->H[6];
|
||||||
|
V7 = sc->H[7];
|
||||||
|
V8 = m512_const1_64( CB0 );
|
||||||
|
V9 = m512_const1_64( CB1 );
|
||||||
|
VA = m512_const1_64( CB2 );
|
||||||
|
VB = m512_const1_64( CB3 );
|
||||||
|
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||||
|
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||||
|
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||||
|
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||||
|
|
||||||
|
M0 = sc->buf[ 0];
|
||||||
|
M1 = sc->buf[ 1];
|
||||||
|
M2 = sc->buf[ 2];
|
||||||
|
M3 = sc->buf[ 3];
|
||||||
|
M4 = sc->buf[ 4];
|
||||||
|
M5 = sc->buf[ 5];
|
||||||
|
M6 = sc->buf[ 6];
|
||||||
|
M7 = sc->buf[ 7];
|
||||||
|
M8 = sc->buf[ 8];
|
||||||
|
M9 = sc->buf[ 9];
|
||||||
|
MA = sc->buf[10];
|
||||||
|
MB = sc->buf[11];
|
||||||
|
MC = sc->buf[12];
|
||||||
|
MD = sc->buf[13];
|
||||||
|
ME = sc->buf[14];
|
||||||
|
MF = sc->buf[15];
|
||||||
|
|
||||||
|
ROUND_B_8WAY(0);
|
||||||
|
ROUND_B_8WAY(1);
|
||||||
|
ROUND_B_8WAY(2);
|
||||||
|
ROUND_B_8WAY(3);
|
||||||
|
ROUND_B_8WAY(4);
|
||||||
|
ROUND_B_8WAY(5);
|
||||||
|
ROUND_B_8WAY(6);
|
||||||
|
ROUND_B_8WAY(7);
|
||||||
|
ROUND_B_8WAY(8);
|
||||||
|
ROUND_B_8WAY(9);
|
||||||
|
ROUND_B_8WAY(0);
|
||||||
|
ROUND_B_8WAY(1);
|
||||||
|
ROUND_B_8WAY(2);
|
||||||
|
ROUND_B_8WAY(3);
|
||||||
|
ROUND_B_8WAY(4);
|
||||||
|
ROUND_B_8WAY(5);
|
||||||
|
|
||||||
|
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
|
||||||
|
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
|
||||||
|
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
|
||||||
|
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
|
||||||
|
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
|
||||||
|
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
|
||||||
|
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
|
||||||
|
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
|
||||||
|
}
|
||||||
|
|
||||||
|
// with final_le forms a full hash in 2 parts from little endian data.
|
||||||
|
// all variables hard coded for 80 bytes/lane.
|
||||||
|
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||||
|
const void *data )
|
||||||
|
{
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
|
||||||
|
// initial hash
|
||||||
|
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||||
|
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||||
|
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||||
|
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||||
|
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||||
|
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||||
|
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||||
|
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||||
|
|
||||||
|
// fill buffer
|
||||||
|
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
|
||||||
|
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
|
||||||
|
sc->buf[11] =
|
||||||
|
sc->buf[12] = m512_zero;
|
||||||
|
sc->buf[13] = m512_one_64;
|
||||||
|
sc->buf[14] = m512_zero;
|
||||||
|
sc->buf[15] = m512_const1_64( 80*8 );
|
||||||
|
|
||||||
|
// build working variables
|
||||||
|
V0 = sc->H[0];
|
||||||
|
V1 = sc->H[1];
|
||||||
|
V2 = sc->H[2];
|
||||||
|
V3 = sc->H[3];
|
||||||
|
V4 = sc->H[4];
|
||||||
|
V5 = sc->H[5];
|
||||||
|
V6 = sc->H[6];
|
||||||
|
V7 = sc->H[7];
|
||||||
|
V8 = m512_const1_64( CB0 );
|
||||||
|
V9 = m512_const1_64( CB1 );
|
||||||
|
VA = m512_const1_64( CB2 );
|
||||||
|
VB = m512_const1_64( CB3 );
|
||||||
|
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
|
||||||
|
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
|
||||||
|
VE = _mm512_set1_epi64( CB6 );
|
||||||
|
VF = _mm512_set1_epi64( CB7 );
|
||||||
|
|
||||||
|
// round 0
|
||||||
|
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||||
|
GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
|
||||||
|
GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
|
||||||
|
GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
|
||||||
|
|
||||||
|
// Do half of G4, skip the nonce
|
||||||
|
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
|
||||||
|
|
||||||
|
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||||
|
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
|
||||||
|
VA = _mm512_add_epi64( VA, VF );
|
||||||
|
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
|
||||||
|
V0 = _mm512_add_epi64( V0, V5 );
|
||||||
|
|
||||||
|
GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
|
||||||
|
GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
|
||||||
|
GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
|
||||||
|
|
||||||
|
// round 1
|
||||||
|
// G1
|
||||||
|
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
|
||||||
|
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
|
||||||
|
sc->buf[ 4] ) );
|
||||||
|
|
||||||
|
// G2
|
||||||
|
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
|
||||||
|
V2 = _mm512_add_epi64( V2, V6 );
|
||||||
|
|
||||||
|
// G3
|
||||||
|
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
|
||||||
|
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
|
||||||
|
|
||||||
|
// save midstate for second part
|
||||||
|
midstate[ 0] = V0;
|
||||||
|
midstate[ 1] = V1;
|
||||||
|
midstate[ 2] = V2;
|
||||||
|
midstate[ 3] = V3;
|
||||||
|
midstate[ 4] = V4;
|
||||||
|
midstate[ 5] = V5;
|
||||||
|
midstate[ 6] = V6;
|
||||||
|
midstate[ 7] = V7;
|
||||||
|
midstate[ 8] = V8;
|
||||||
|
midstate[ 9] = V9;
|
||||||
|
midstate[10] = VA;
|
||||||
|
midstate[11] = VB;
|
||||||
|
midstate[12] = VC;
|
||||||
|
midstate[13] = VD;
|
||||||
|
midstate[14] = VE;
|
||||||
|
midstate[15] = VF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// pick up where we left off, need the nonce now.
|
||||||
|
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||||
|
const __m512i nonce, const __m512i *midstate )
|
||||||
|
{
|
||||||
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
__m512i h[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
// Load data with new nonce
|
||||||
|
M0 = sc->buf[ 0];
|
||||||
|
M1 = sc->buf[ 1];
|
||||||
|
M2 = sc->buf[ 2];
|
||||||
|
M3 = sc->buf[ 3];
|
||||||
|
M4 = sc->buf[ 4];
|
||||||
|
M5 = sc->buf[ 5];
|
||||||
|
M6 = sc->buf[ 6];
|
||||||
|
M7 = sc->buf[ 7];
|
||||||
|
M8 = sc->buf[ 8];
|
||||||
|
M9 = nonce;
|
||||||
|
MA = sc->buf[10];
|
||||||
|
MB = sc->buf[11];
|
||||||
|
MC = sc->buf[12];
|
||||||
|
MD = sc->buf[13];
|
||||||
|
ME = sc->buf[14];
|
||||||
|
MF = sc->buf[15];
|
||||||
|
|
||||||
|
V0 = midstate[ 0];
|
||||||
|
V1 = midstate[ 1];
|
||||||
|
V2 = midstate[ 2];
|
||||||
|
V3 = midstate[ 3];
|
||||||
|
V4 = midstate[ 4];
|
||||||
|
V5 = midstate[ 5];
|
||||||
|
V6 = midstate[ 6];
|
||||||
|
V7 = midstate[ 7];
|
||||||
|
V8 = midstate[ 8];
|
||||||
|
V9 = midstate[ 9];
|
||||||
|
VA = midstate[10];
|
||||||
|
VB = midstate[11];
|
||||||
|
VC = midstate[12];
|
||||||
|
VD = midstate[13];
|
||||||
|
VE = midstate[14];
|
||||||
|
VF = midstate[15];
|
||||||
|
|
||||||
|
// finish round 0 with the nonce now available
|
||||||
|
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CB8 ), M9 ) );
|
||||||
|
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
|
||||||
|
VA = _mm512_add_epi64( VA, VF );
|
||||||
|
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
|
||||||
|
|
||||||
|
// Round 1
|
||||||
|
// G0
|
||||||
|
GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
|
||||||
|
|
||||||
|
// G1
|
||||||
|
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
|
||||||
|
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
|
||||||
|
|
||||||
|
V1 = _mm512_add_epi64( V1, V5 );
|
||||||
|
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
|
||||||
|
V9 = _mm512_add_epi64( V9, VD );
|
||||||
|
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
|
||||||
|
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||||
|
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
|
||||||
|
V9 = _mm512_add_epi64( V9, VD );
|
||||||
|
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
|
||||||
|
|
||||||
|
// G2
|
||||||
|
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
|
||||||
|
// V2 = _mm512_add_epi64( V2, V6 );
|
||||||
|
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CBF ), M9 ) );
|
||||||
|
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
|
||||||
|
VA = _mm512_add_epi64( VA, VE );
|
||||||
|
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
|
||||||
|
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
|
||||||
|
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
|
||||||
|
VA = _mm512_add_epi64( VA, VE );
|
||||||
|
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
|
||||||
|
|
||||||
|
// G3
|
||||||
|
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
|
||||||
|
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
|
||||||
|
|
||||||
|
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
|
||||||
|
VB = _mm512_add_epi64( VB, VF );
|
||||||
|
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
|
||||||
|
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||||
|
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||||
|
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
|
||||||
|
VB = _mm512_add_epi64( VB, VF );
|
||||||
|
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
|
||||||
|
|
||||||
|
// G4, G5, G6, G7
|
||||||
|
GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
|
||||||
|
GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
|
||||||
|
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
|
||||||
|
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
|
||||||
|
|
||||||
|
|
||||||
|
// remaining rounds
|
||||||
|
ROUND_B_8WAY(2);
|
||||||
|
ROUND_B_8WAY(3);
|
||||||
|
ROUND_B_8WAY(4);
|
||||||
|
ROUND_B_8WAY(5);
|
||||||
|
ROUND_B_8WAY(6);
|
||||||
|
ROUND_B_8WAY(7);
|
||||||
|
ROUND_B_8WAY(8);
|
||||||
|
ROUND_B_8WAY(9);
|
||||||
|
ROUND_B_8WAY(0);
|
||||||
|
ROUND_B_8WAY(1);
|
||||||
|
ROUND_B_8WAY(2);
|
||||||
|
ROUND_B_8WAY(3);
|
||||||
|
ROUND_B_8WAY(4);
|
||||||
|
ROUND_B_8WAY(5);
|
||||||
|
|
||||||
|
h[0] = mm512_xor3( V8, V0, sc->H[0] );
|
||||||
|
h[1] = mm512_xor3( V9, V1, sc->H[1] );
|
||||||
|
h[2] = mm512_xor3( VA, V2, sc->H[2] );
|
||||||
|
h[3] = mm512_xor3( VB, V3, sc->H[3] );
|
||||||
|
h[4] = mm512_xor3( VC, V4, sc->H[4] );
|
||||||
|
h[5] = mm512_xor3( VD, V5, sc->H[5] );
|
||||||
|
h[6] = mm512_xor3( VE, V6, sc->H[6] );
|
||||||
|
h[7] = mm512_xor3( VF, V7, sc->H[7] );
|
||||||
|
|
||||||
|
// bswap final hash
|
||||||
|
mm512_block_bswap_64( (__m512i*)hash, h );
|
||||||
|
}
|
||||||
|
|
||||||
void blake512_8way_init( blake_8way_big_context *sc )
|
void blake512_8way_init( blake_8way_big_context *sc )
|
||||||
{
|
{
|
||||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||||
@@ -678,6 +972,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
|||||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||||
|
const void *data, size_t len )
|
||||||
|
{
|
||||||
|
|
||||||
|
// init
|
||||||
|
|
||||||
|
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||||
|
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||||
|
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||||
|
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||||
|
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||||
|
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||||
|
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||||
|
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||||
|
|
||||||
|
sc->T0 = sc->T1 = 0;
|
||||||
|
sc->ptr = 0;
|
||||||
|
|
||||||
|
// update
|
||||||
|
|
||||||
|
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
|
||||||
|
sc->ptr = len;
|
||||||
|
if ( len == 128 )
|
||||||
|
{
|
||||||
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
|
sc->T1 = sc->T1 + 1;
|
||||||
|
blake512_8way_compress_le( sc );
|
||||||
|
sc->ptr = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// close
|
||||||
|
|
||||||
|
size_t ptr64 = sc->ptr >> 3;
|
||||||
|
unsigned bit_len;
|
||||||
|
uint64_t th, tl;
|
||||||
|
|
||||||
|
bit_len = sc->ptr << 3;
|
||||||
|
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
|
||||||
|
tl = sc->T0 + bit_len;
|
||||||
|
th = sc->T1;
|
||||||
|
|
||||||
|
if ( ptr64 == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||||
|
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||||
|
}
|
||||||
|
else if ( sc->T0 == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
|
||||||
|
sc->T1 = sc->T1 - 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
sc->T0 -= 1024 - bit_len;
|
||||||
|
|
||||||
|
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||||
|
sc->buf[13] = m512_one_64;
|
||||||
|
sc->buf[14] = m512_const1_64( th );
|
||||||
|
sc->buf[15] = m512_const1_64( tl );
|
||||||
|
|
||||||
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
|
sc->T1 = sc->T1 + 1;
|
||||||
|
|
||||||
|
blake512_8way_compress_le( sc );
|
||||||
|
|
||||||
|
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
@@ -741,14 +1102,10 @@ blake512_8way_close(void *cc, void *dst)
|
|||||||
V9 = m256_const1_64( CB1 ); \
|
V9 = m256_const1_64( CB1 ); \
|
||||||
VA = m256_const1_64( CB2 ); \
|
VA = m256_const1_64( CB2 ); \
|
||||||
VB = m256_const1_64( CB3 ); \
|
VB = m256_const1_64( CB3 ); \
|
||||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
|
||||||
m256_const1_64( CB4 ) ); \
|
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
|
||||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
|
||||||
m256_const1_64( CB5 ) ); \
|
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
|
||||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
|
||||||
m256_const1_64( CB6 ) ); \
|
|
||||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
|
||||||
m256_const1_64( CB7 ) ); \
|
|
||||||
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||||
@@ -869,6 +1226,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
|||||||
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||||
|
const void *data )
|
||||||
|
{
|
||||||
|
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
|
||||||
|
// initial hash
|
||||||
|
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||||
|
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||||
|
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||||
|
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||||
|
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
|
||||||
|
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||||
|
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||||
|
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||||
|
|
||||||
|
// fill buffer
|
||||||
|
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
|
||||||
|
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
|
||||||
|
sc->buf[11] = m256_zero;
|
||||||
|
sc->buf[12] = m256_zero;
|
||||||
|
sc->buf[13] = m256_one_64;
|
||||||
|
sc->buf[14] = m256_zero;
|
||||||
|
sc->buf[15] = m256_const1_64( 80*8 );
|
||||||
|
|
||||||
|
// build working variables
|
||||||
|
V0 = sc->H[0];
|
||||||
|
V1 = sc->H[1];
|
||||||
|
V2 = sc->H[2];
|
||||||
|
V3 = sc->H[3];
|
||||||
|
V4 = sc->H[4];
|
||||||
|
V5 = sc->H[5];
|
||||||
|
V6 = sc->H[6];
|
||||||
|
V7 = sc->H[7];
|
||||||
|
V8 = m256_const1_64( CB0 );
|
||||||
|
V9 = m256_const1_64( CB1 );
|
||||||
|
VA = m256_const1_64( CB2 );
|
||||||
|
VB = m256_const1_64( CB3 );
|
||||||
|
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
|
||||||
|
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
|
||||||
|
VE = _mm256_set1_epi64x( CB6 );
|
||||||
|
VF = _mm256_set1_epi64x( CB7 );
|
||||||
|
|
||||||
|
// round 0
|
||||||
|
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||||
|
GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
|
||||||
|
GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
|
||||||
|
GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
|
||||||
|
|
||||||
|
// G4 skip nonce
|
||||||
|
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||||
|
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
|
||||||
|
VA = _mm256_add_epi64( VA, VF );
|
||||||
|
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
|
||||||
|
V0 = _mm256_add_epi64( V0, V5 );
|
||||||
|
|
||||||
|
GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
|
||||||
|
GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
|
||||||
|
GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
|
||||||
|
|
||||||
|
// round 1
|
||||||
|
// G1
|
||||||
|
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
|
||||||
|
sc->buf[ 4] ) );
|
||||||
|
|
||||||
|
// G2
|
||||||
|
V2 = _mm256_add_epi64( V2, V6 );
|
||||||
|
|
||||||
|
// G3
|
||||||
|
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
|
||||||
|
|
||||||
|
// save midstate for second part
|
||||||
|
midstate[ 0] = V0;
|
||||||
|
midstate[ 1] = V1;
|
||||||
|
midstate[ 2] = V2;
|
||||||
|
midstate[ 3] = V3;
|
||||||
|
midstate[ 4] = V4;
|
||||||
|
midstate[ 5] = V5;
|
||||||
|
midstate[ 6] = V6;
|
||||||
|
midstate[ 7] = V7;
|
||||||
|
midstate[ 8] = V8;
|
||||||
|
midstate[ 9] = V9;
|
||||||
|
midstate[10] = VA;
|
||||||
|
midstate[11] = VB;
|
||||||
|
midstate[12] = VC;
|
||||||
|
midstate[13] = VD;
|
||||||
|
midstate[14] = VE;
|
||||||
|
midstate[15] = VF;
|
||||||
|
}
|
||||||
|
|
||||||
|
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||||
|
const __m256i nonce, const __m256i *midstate )
|
||||||
|
{
|
||||||
|
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
|
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
|
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
|
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||||
|
__m256i h[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
// Load data with new nonce
|
||||||
|
M0 = sc->buf[ 0];
|
||||||
|
M1 = sc->buf[ 1];
|
||||||
|
M2 = sc->buf[ 2];
|
||||||
|
M3 = sc->buf[ 3];
|
||||||
|
M4 = sc->buf[ 4];
|
||||||
|
M5 = sc->buf[ 5];
|
||||||
|
M6 = sc->buf[ 6];
|
||||||
|
M7 = sc->buf[ 7];
|
||||||
|
M8 = sc->buf[ 8];
|
||||||
|
M9 = nonce;
|
||||||
|
MA = sc->buf[10];
|
||||||
|
MB = sc->buf[11];
|
||||||
|
MC = sc->buf[12];
|
||||||
|
MD = sc->buf[13];
|
||||||
|
ME = sc->buf[14];
|
||||||
|
MF = sc->buf[15];
|
||||||
|
|
||||||
|
V0 = midstate[ 0];
|
||||||
|
V1 = midstate[ 1];
|
||||||
|
V2 = midstate[ 2];
|
||||||
|
V3 = midstate[ 3];
|
||||||
|
V4 = midstate[ 4];
|
||||||
|
V5 = midstate[ 5];
|
||||||
|
V6 = midstate[ 6];
|
||||||
|
V7 = midstate[ 7];
|
||||||
|
V8 = midstate[ 8];
|
||||||
|
V9 = midstate[ 9];
|
||||||
|
VA = midstate[10];
|
||||||
|
VB = midstate[11];
|
||||||
|
VC = midstate[12];
|
||||||
|
VD = midstate[13];
|
||||||
|
VE = midstate[14];
|
||||||
|
VF = midstate[15];
|
||||||
|
|
||||||
|
// finish round 0, with the nonce now available
|
||||||
|
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CB8 ), M9 ) );
|
||||||
|
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
|
||||||
|
VA = _mm256_add_epi64( VA, VF );
|
||||||
|
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
|
||||||
|
|
||||||
|
// Round 1
|
||||||
|
// G0
|
||||||
|
GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
|
||||||
|
|
||||||
|
// G1
|
||||||
|
V1 = _mm256_add_epi64( V1, V5 );
|
||||||
|
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
|
||||||
|
V9 = _mm256_add_epi64( V9, VD );
|
||||||
|
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
|
||||||
|
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||||
|
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
|
||||||
|
V9 = _mm256_add_epi64( V9, VD );
|
||||||
|
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
|
||||||
|
|
||||||
|
// G2
|
||||||
|
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CBF ), M9 ) );
|
||||||
|
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
|
||||||
|
VA = _mm256_add_epi64( VA, VE );
|
||||||
|
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
|
||||||
|
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
|
||||||
|
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
|
||||||
|
VA = _mm256_add_epi64( VA, VE );
|
||||||
|
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
|
||||||
|
|
||||||
|
// G3
|
||||||
|
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
|
||||||
|
VB = _mm256_add_epi64( VB, VF );
|
||||||
|
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
|
||||||
|
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||||
|
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||||
|
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
|
||||||
|
VB = _mm256_add_epi64( VB, VF );
|
||||||
|
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
|
||||||
|
|
||||||
|
// G4, G5, G6, G7
|
||||||
|
GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
|
||||||
|
GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
|
||||||
|
GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
|
||||||
|
GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
|
||||||
|
|
||||||
|
ROUND_B_4WAY(2);
|
||||||
|
ROUND_B_4WAY(3);
|
||||||
|
ROUND_B_4WAY(4);
|
||||||
|
ROUND_B_4WAY(5);
|
||||||
|
ROUND_B_4WAY(6);
|
||||||
|
ROUND_B_4WAY(7);
|
||||||
|
ROUND_B_4WAY(8);
|
||||||
|
ROUND_B_4WAY(9);
|
||||||
|
ROUND_B_4WAY(0);
|
||||||
|
ROUND_B_4WAY(1);
|
||||||
|
ROUND_B_4WAY(2);
|
||||||
|
ROUND_B_4WAY(3);
|
||||||
|
ROUND_B_4WAY(4);
|
||||||
|
ROUND_B_4WAY(5);
|
||||||
|
|
||||||
|
h[0] = mm256_xor3( V8, V0, sc->H[0] );
|
||||||
|
h[1] = mm256_xor3( V9, V1, sc->H[1] );
|
||||||
|
h[2] = mm256_xor3( VA, V2, sc->H[2] );
|
||||||
|
h[3] = mm256_xor3( VB, V3, sc->H[3] );
|
||||||
|
h[4] = mm256_xor3( VC, V4, sc->H[4] );
|
||||||
|
h[5] = mm256_xor3( VD, V5, sc->H[5] );
|
||||||
|
h[6] = mm256_xor3( VE, V6, sc->H[6] );
|
||||||
|
h[7] = mm256_xor3( VF, V7, sc->H[7] );
|
||||||
|
|
||||||
|
// bswap final hash
|
||||||
|
mm256_block_bswap_64( (__m256i*)hash, h );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void blake512_4way_init( blake_4way_big_context *sc )
|
void blake512_4way_init( blake_4way_big_context *sc )
|
||||||
{
|
{
|
||||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||||
|
@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
|
|||||||
H7 ^= S3 ^ V7 ^ VF; \
|
H7 ^= S3 ^ V7 ^ VF; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define COMPRESS32_LE do { \
|
||||||
|
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||||
|
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||||
|
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||||
|
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||||
|
V0 = H0; \
|
||||||
|
V1 = H1; \
|
||||||
|
V2 = H2; \
|
||||||
|
V3 = H3; \
|
||||||
|
V4 = H4; \
|
||||||
|
V5 = H5; \
|
||||||
|
V6 = H6; \
|
||||||
|
V7 = H7; \
|
||||||
|
V8 = S0 ^ CS0; \
|
||||||
|
V9 = S1 ^ CS1; \
|
||||||
|
VA = S2 ^ CS2; \
|
||||||
|
VB = S3 ^ CS3; \
|
||||||
|
VC = T0 ^ CS4; \
|
||||||
|
VD = T0 ^ CS5; \
|
||||||
|
VE = T1 ^ CS6; \
|
||||||
|
VF = T1 ^ CS7; \
|
||||||
|
M0 = *((uint32_t*)(buf + 0)); \
|
||||||
|
M1 = *((uint32_t*)(buf + 4)); \
|
||||||
|
M2 = *((uint32_t*)(buf + 8)); \
|
||||||
|
M3 = *((uint32_t*)(buf + 12)); \
|
||||||
|
M4 = *((uint32_t*)(buf + 16)); \
|
||||||
|
M5 = *((uint32_t*)(buf + 20)); \
|
||||||
|
M6 = *((uint32_t*)(buf + 24)); \
|
||||||
|
M7 = *((uint32_t*)(buf + 28)); \
|
||||||
|
M8 = *((uint32_t*)(buf + 32)); \
|
||||||
|
M9 = *((uint32_t*)(buf + 36)); \
|
||||||
|
MA = *((uint32_t*)(buf + 40)); \
|
||||||
|
MB = *((uint32_t*)(buf + 44)); \
|
||||||
|
MC = *((uint32_t*)(buf + 48)); \
|
||||||
|
MD = *((uint32_t*)(buf + 52)); \
|
||||||
|
ME = *((uint32_t*)(buf + 56)); \
|
||||||
|
MF = *((uint32_t*)(buf + 60)); \
|
||||||
|
ROUND_S(0); \
|
||||||
|
ROUND_S(1); \
|
||||||
|
ROUND_S(2); \
|
||||||
|
ROUND_S(3); \
|
||||||
|
ROUND_S(4); \
|
||||||
|
ROUND_S(5); \
|
||||||
|
ROUND_S(6); \
|
||||||
|
ROUND_S(7); \
|
||||||
|
if (BLAKE32_ROUNDS == 14) { \
|
||||||
|
ROUND_S(8); \
|
||||||
|
ROUND_S(9); \
|
||||||
|
ROUND_S(0); \
|
||||||
|
ROUND_S(1); \
|
||||||
|
ROUND_S(2); \
|
||||||
|
ROUND_S(3); \
|
||||||
|
} \
|
||||||
|
H0 ^= S0 ^ V0 ^ V8; \
|
||||||
|
H1 ^= S1 ^ V1 ^ V9; \
|
||||||
|
H2 ^= S2 ^ V2 ^ VA; \
|
||||||
|
H3 ^= S3 ^ V3 ^ VB; \
|
||||||
|
H4 ^= S0 ^ V4 ^ VC; \
|
||||||
|
H5 ^= S1 ^ V5 ^ VD; \
|
||||||
|
H6 ^= S2 ^ V6 ^ VE; \
|
||||||
|
H7 ^= S3 ^ V7 ^ VF; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SPH_64
|
#if SPH_64
|
||||||
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
|
|||||||
sc->ptr = ptr;
|
sc->ptr = ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
unsigned char *buf;
|
||||||
|
size_t ptr;
|
||||||
|
DECL_STATE32
|
||||||
|
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
|
||||||
|
if (len < (sizeof sc->buf) - ptr) {
|
||||||
|
memcpy(buf + ptr, data, len);
|
||||||
|
ptr += len;
|
||||||
|
sc->ptr = ptr;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
READ_STATE32(sc);
|
||||||
|
while (len > 0) {
|
||||||
|
size_t clen;
|
||||||
|
|
||||||
|
clen = (sizeof sc->buf) - ptr;
|
||||||
|
if (clen > len)
|
||||||
|
clen = len;
|
||||||
|
memcpy(buf + ptr, data, clen);
|
||||||
|
ptr += clen;
|
||||||
|
data = (const unsigned char *)data + clen;
|
||||||
|
len -= clen;
|
||||||
|
if (ptr == sizeof sc->buf) {
|
||||||
|
if ((T0 = SPH_T32(T0 + 512)) < 512)
|
||||||
|
T1 = SPH_T32(T1 + 1);
|
||||||
|
COMPRESS32_LE;
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WRITE_STATE32(sc);
|
||||||
|
sc->ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_close(sph_blake_small_context *sc,
|
blake32_close(sph_blake_small_context *sc,
|
||||||
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
|
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
|
||||||
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
|
|||||||
blake32(cc, data, len);
|
blake32(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
sph_blake256_update_le(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_le(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
/* see sph_blake.h */
|
/* see sph_blake.h */
|
||||||
void
|
void
|
||||||
sph_blake256_close(void *cc, void *dst)
|
sph_blake256_close(void *cc, void *dst)
|
||||||
|
@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
|
|||||||
* @param len the input data length (in bytes)
|
* @param len the input data length (in bytes)
|
||||||
*/
|
*/
|
||||||
void sph_blake256(void *cc, const void *data, size_t len);
|
void sph_blake256(void *cc, const void *data, size_t len);
|
||||||
|
void sph_blake256_update_le(void *cc, const void *data, size_t len);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Terminate the current BLAKE-256 computation and output the result into
|
* Terminate the current BLAKE-256 computation and output the result into
|
||||||
|
@@ -594,9 +594,6 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
|||||||
#define rb6(x) mm256_rol_64( x, 43 )
|
#define rb6(x) mm256_rol_64( x, 43 )
|
||||||
#define rb7(x) mm256_rol_64( x, 53 )
|
#define rb7(x) mm256_rol_64( x, 53 )
|
||||||
|
|
||||||
#define rol_off_64( M, j ) \
|
|
||||||
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
|
||||||
|
|
||||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||||
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||||
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||||
@@ -732,8 +729,23 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
|||||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||||
|
|
||||||
__m256i mj[16];
|
__m256i mj[16];
|
||||||
for ( i = 0; i < 16; i++ )
|
|
||||||
mj[i] = rol_off_64( M, i );
|
mj[ 0] = mm256_rol_64( M[ 0], 1 );
|
||||||
|
mj[ 1] = mm256_rol_64( M[ 1], 2 );
|
||||||
|
mj[ 2] = mm256_rol_64( M[ 2], 3 );
|
||||||
|
mj[ 3] = mm256_rol_64( M[ 3], 4 );
|
||||||
|
mj[ 4] = mm256_rol_64( M[ 4], 5 );
|
||||||
|
mj[ 5] = mm256_rol_64( M[ 5], 6 );
|
||||||
|
mj[ 6] = mm256_rol_64( M[ 6], 7 );
|
||||||
|
mj[ 7] = mm256_rol_64( M[ 7], 8 );
|
||||||
|
mj[ 8] = mm256_rol_64( M[ 8], 9 );
|
||||||
|
mj[ 9] = mm256_rol_64( M[ 9], 10 );
|
||||||
|
mj[10] = mm256_rol_64( M[10], 11 );
|
||||||
|
mj[11] = mm256_rol_64( M[11], 12 );
|
||||||
|
mj[12] = mm256_rol_64( M[12], 13 );
|
||||||
|
mj[13] = mm256_rol_64( M[13], 14 );
|
||||||
|
mj[14] = mm256_rol_64( M[14], 15 );
|
||||||
|
mj[15] = mm256_rol_64( M[15], 16 );
|
||||||
|
|
||||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||||
@@ -1034,9 +1046,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||||
|
|
||||||
#define rol8w_off_64( M, j ) \
|
|
||||||
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
|
||||||
|
|
||||||
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||||
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||||
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||||
@@ -1171,41 +1180,73 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
|||||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||||
|
|
||||||
__m512i mj[16];
|
__m512i mj[16];
|
||||||
for ( i = 0; i < 16; i++ )
|
uint64_t K = 16 * 0x0555555555555555ULL;
|
||||||
mj[i] = rol8w_off_64( M, i );
|
|
||||||
|
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||||
|
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||||
|
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
||||||
|
mj[ 3] = mm512_rol_64( M[ 3], 4 );
|
||||||
|
mj[ 4] = mm512_rol_64( M[ 4], 5 );
|
||||||
|
mj[ 5] = mm512_rol_64( M[ 5], 6 );
|
||||||
|
mj[ 6] = mm512_rol_64( M[ 6], 7 );
|
||||||
|
mj[ 7] = mm512_rol_64( M[ 7], 8 );
|
||||||
|
mj[ 8] = mm512_rol_64( M[ 8], 9 );
|
||||||
|
mj[ 9] = mm512_rol_64( M[ 9], 10 );
|
||||||
|
mj[10] = mm512_rol_64( M[10], 11 );
|
||||||
|
mj[11] = mm512_rol_64( M[11], 12 );
|
||||||
|
mj[12] = mm512_rol_64( M[12], 13 );
|
||||||
|
mj[13] = mm512_rol_64( M[13], 14 );
|
||||||
|
mj[14] = mm512_rol_64( M[14], 15 );
|
||||||
|
mj[15] = mm512_rol_64( M[15], 16 );
|
||||||
|
|
||||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||||
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||||
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||||
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||||
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||||
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||||
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||||
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||||
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||||
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||||
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||||
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||||
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||||
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||||
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||||
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
K += 0x0555555555555555ULL;
|
||||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||||
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
|
(const __m512i)_mm512_set1_epi64( K ) );
|
||||||
|
|
||||||
|
|
||||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||||
|
@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
|
|||||||
x5 = _mm512_add_epi32( x1, x5 );
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
x6 = _mm512_add_epi32( x2, x6 );
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
x7 = _mm512_add_epi32( x3, x7 );
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
y0 = x0;
|
y0 = mm512_rol_32( x2, 7 );
|
||||||
y1 = x1;
|
y1 = mm512_rol_32( x3, 7 );
|
||||||
x0 = mm512_rol_32( x2, 7 );
|
x2 = mm512_rol_32( x0, 7 );
|
||||||
x1 = mm512_rol_32( x3, 7 );
|
x3 = mm512_rol_32( x1, 7 );
|
||||||
x2 = mm512_rol_32( y0, 7 );
|
x0 = _mm512_xor_si512( y0, x4 );
|
||||||
x3 = mm512_rol_32( y1, 7 );
|
x1 = _mm512_xor_si512( y1, x5 );
|
||||||
x0 = _mm512_xor_si512( x0, x4 );
|
|
||||||
x1 = _mm512_xor_si512( x1, x5 );
|
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
x4 = mm512_swap128_64( x4 );
|
x4 = mm512_swap128_64( x4 );
|
||||||
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
|
|||||||
x5 = _mm512_add_epi32( x1, x5 );
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
x6 = _mm512_add_epi32( x2, x6 );
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
x7 = _mm512_add_epi32( x3, x7 );
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
y0 = x0;
|
y0 = mm512_rol_32( x1, 11 );
|
||||||
y1 = x2;
|
x1 = mm512_rol_32( x0, 11 );
|
||||||
x0 = mm512_rol_32( x1, 11 );
|
y1 = mm512_rol_32( x3, 11 );
|
||||||
x1 = mm512_rol_32( y0, 11 );
|
x3 = mm512_rol_32( x2, 11 );
|
||||||
x2 = mm512_rol_32( x3, 11 );
|
x0 = _mm512_xor_si512( y0, x4 );
|
||||||
x3 = mm512_rol_32( y1, 11 );
|
|
||||||
x0 = _mm512_xor_si512( x0, x4 );
|
|
||||||
x1 = _mm512_xor_si512( x1, x5 );
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
x2 = _mm512_xor_si512( y1, x6 );
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
x4 = mm512_swap64_32( x4 );
|
x4 = mm512_swap64_32( x4 );
|
||||||
x5 = mm512_swap64_32( x5 );
|
x5 = mm512_swap64_32( x5 );
|
||||||
@@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp )
|
|||||||
{
|
{
|
||||||
x4 = _mm512_add_epi32( x0, x4 );
|
x4 = _mm512_add_epi32( x0, x4 );
|
||||||
y4 = _mm512_add_epi32( y0, y4 );
|
y4 = _mm512_add_epi32( y0, y4 );
|
||||||
tx0 = x0;
|
|
||||||
ty0 = y0;
|
|
||||||
x5 = _mm512_add_epi32( x1, x5 );
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
y5 = _mm512_add_epi32( y1, y5 );
|
y5 = _mm512_add_epi32( y1, y5 );
|
||||||
tx1 = x1;
|
tx0 = mm512_rol_32( x2, 7 );
|
||||||
ty1 = y1;
|
ty0 = mm512_rol_32( y2, 7 );
|
||||||
x0 = mm512_rol_32( x2, 7 );
|
tx1 = mm512_rol_32( x3, 7 );
|
||||||
y0 = mm512_rol_32( y2, 7 );
|
ty1 = mm512_rol_32( y3, 7 );
|
||||||
x6 = _mm512_add_epi32( x2, x6 );
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
y6 = _mm512_add_epi32( y2, y6 );
|
y6 = _mm512_add_epi32( y2, y6 );
|
||||||
x1 = mm512_rol_32( x3, 7 );
|
|
||||||
y1 = mm512_rol_32( y3, 7 );
|
|
||||||
x7 = _mm512_add_epi32( x3, x7 );
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
y7 = _mm512_add_epi32( y3, y7 );
|
y7 = _mm512_add_epi32( y3, y7 );
|
||||||
|
x2 = mm512_rol_32( x0, 7 );
|
||||||
|
y2 = mm512_rol_32( y0, 7 );
|
||||||
x2 = mm512_rol_32( tx0, 7 );
|
x3 = mm512_rol_32( x1, 7 );
|
||||||
y2 = mm512_rol_32( ty0, 7 );
|
y3 = mm512_rol_32( y1, 7 );
|
||||||
x0 = _mm512_xor_si512( x0, x4 );
|
x0 = _mm512_xor_si512( tx0, x4 );
|
||||||
y0 = _mm512_xor_si512( y0, y4 );
|
y0 = _mm512_xor_si512( ty0, y4 );
|
||||||
|
x1 = _mm512_xor_si512( tx1, x5 );
|
||||||
|
y1 = _mm512_xor_si512( ty1, y5 );
|
||||||
x4 = mm512_swap128_64( x4 );
|
x4 = mm512_swap128_64( x4 );
|
||||||
x3 = mm512_rol_32( tx1, 7 );
|
|
||||||
y3 = mm512_rol_32( ty1, 7 );
|
|
||||||
y4 = mm512_swap128_64( y4 );
|
y4 = mm512_swap128_64( y4 );
|
||||||
|
|
||||||
x1 = _mm512_xor_si512( x1, x5 );
|
|
||||||
y1 = _mm512_xor_si512( y1, y5 );
|
|
||||||
x5 = mm512_swap128_64( x5 );
|
x5 = mm512_swap128_64( x5 );
|
||||||
|
y5 = mm512_swap128_64( y5 );
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
y2 = _mm512_xor_si512( y2, y6 );
|
y2 = _mm512_xor_si512( y2, y6 );
|
||||||
y5 = mm512_swap128_64( y5 );
|
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
y3 = _mm512_xor_si512( y3, y7 );
|
y3 = _mm512_xor_si512( y3, y7 );
|
||||||
|
|
||||||
x6 = mm512_swap128_64( x6 );
|
x6 = mm512_swap128_64( x6 );
|
||||||
|
y6 = mm512_swap128_64( y6 );
|
||||||
|
x7 = mm512_swap128_64( x7 );
|
||||||
|
y7 = mm512_swap128_64( y7 );
|
||||||
x4 = _mm512_add_epi32( x0, x4 );
|
x4 = _mm512_add_epi32( x0, x4 );
|
||||||
y4 = _mm512_add_epi32( y0, y4 );
|
y4 = _mm512_add_epi32( y0, y4 );
|
||||||
y6 = mm512_swap128_64( y6 );
|
|
||||||
x5 = _mm512_add_epi32( x1, x5 );
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
y5 = _mm512_add_epi32( y1, y5 );
|
y5 = _mm512_add_epi32( y1, y5 );
|
||||||
x7 = mm512_swap128_64( x7 );
|
tx0 = mm512_rol_32( x1, 11 );
|
||||||
|
ty0 = mm512_rol_32( y1, 11 );
|
||||||
|
tx1 = mm512_rol_32( x3, 11 );
|
||||||
|
ty1 = mm512_rol_32( y3, 11 );
|
||||||
x6 = _mm512_add_epi32( x2, x6 );
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
y6 = _mm512_add_epi32( y2, y6 );
|
y6 = _mm512_add_epi32( y2, y6 );
|
||||||
tx0 = x0;
|
|
||||||
ty0 = y0;
|
|
||||||
y7 = mm512_swap128_64( y7 );
|
|
||||||
tx1 = x2;
|
|
||||||
ty1 = y2;
|
|
||||||
x0 = mm512_rol_32( x1, 11 );
|
|
||||||
y0 = mm512_rol_32( y1, 11 );
|
|
||||||
|
|
||||||
x7 = _mm512_add_epi32( x3, x7 );
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
y7 = _mm512_add_epi32( y3, y7 );
|
y7 = _mm512_add_epi32( y3, y7 );
|
||||||
|
x1 = mm512_rol_32( x0, 11 );
|
||||||
x1 = mm512_rol_32( tx0, 11 );
|
y1 = mm512_rol_32( y0, 11 );
|
||||||
y1 = mm512_rol_32( ty0, 11 );
|
x3 = mm512_rol_32( x2, 11 );
|
||||||
x0 = _mm512_xor_si512( x0, x4 );
|
y3 = mm512_rol_32( y2, 11 );
|
||||||
x4 = mm512_swap64_32( x4 );
|
x0 = _mm512_xor_si512( tx0, x4 );
|
||||||
y0 = _mm512_xor_si512( y0, y4 );
|
y0 = _mm512_xor_si512( ty0, y4 );
|
||||||
x2 = mm512_rol_32( x3, 11 );
|
|
||||||
y4 = mm512_swap64_32( y4 );
|
|
||||||
y2 = mm512_rol_32( y3, 11 );
|
|
||||||
x1 = _mm512_xor_si512( x1, x5 );
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
x5 = mm512_swap64_32( x5 );
|
|
||||||
y1 = _mm512_xor_si512( y1, y5 );
|
y1 = _mm512_xor_si512( y1, y5 );
|
||||||
x3 = mm512_rol_32( tx1, 11 );
|
x4 = mm512_swap64_32( x4 );
|
||||||
|
y4 = mm512_swap64_32( y4 );
|
||||||
|
x5 = mm512_swap64_32( x5 );
|
||||||
y5 = mm512_swap64_32( y5 );
|
y5 = mm512_swap64_32( y5 );
|
||||||
y3 = mm512_rol_32( ty1, 11 );
|
x2 = _mm512_xor_si512( tx1, x6 );
|
||||||
|
y2 = _mm512_xor_si512( ty1, y6 );
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
|
||||||
x6 = mm512_swap64_32( x6 );
|
|
||||||
y2 = _mm512_xor_si512( y2, y6 );
|
|
||||||
y6 = mm512_swap64_32( y6 );
|
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
x7 = mm512_swap64_32( x7 );
|
|
||||||
y3 = _mm512_xor_si512( y3, y7 );
|
y3 = _mm512_xor_si512( y3, y7 );
|
||||||
|
x6 = mm512_swap64_32( x6 );
|
||||||
|
y6 = mm512_swap64_32( y6 );
|
||||||
|
x7 = mm512_swap64_32( x7 );
|
||||||
y7 = mm512_swap64_32( y7 );
|
y7 = mm512_swap64_32( y7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
|||||||
sp->rounds = rounds;
|
sp->rounds = rounds;
|
||||||
sp->pos = 0;
|
sp->pos = 0;
|
||||||
|
|
||||||
h[ 0] = m512_const1_128( iv[0] );
|
|
||||||
h[ 1] = m512_const1_128( iv[1] );
|
|
||||||
h[ 2] = m512_const1_128( iv[2] );
|
|
||||||
h[ 3] = m512_const1_128( iv[3] );
|
|
||||||
h[ 4] = m512_const1_128( iv[4] );
|
|
||||||
h[ 5] = m512_const1_128( iv[5] );
|
|
||||||
h[ 6] = m512_const1_128( iv[6] );
|
|
||||||
h[ 7] = m512_const1_128( iv[7] );
|
|
||||||
h[ 0] = m512_const1_128( iv[0] );
|
h[ 0] = m512_const1_128( iv[0] );
|
||||||
h[ 1] = m512_const1_128( iv[1] );
|
h[ 1] = m512_const1_128( iv[1] );
|
||||||
h[ 2] = m512_const1_128( iv[2] );
|
h[ 2] = m512_const1_128( iv[2] );
|
||||||
@@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp )
|
|||||||
x5 = _mm256_add_epi32( x1, x5 );
|
x5 = _mm256_add_epi32( x1, x5 );
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
x6 = _mm256_add_epi32( x2, x6 );
|
||||||
x7 = _mm256_add_epi32( x3, x7 );
|
x7 = _mm256_add_epi32( x3, x7 );
|
||||||
y0 = x0;
|
ROL2( y0, y1, x2, x3, 7 );
|
||||||
y1 = x1;
|
ROL2( x2, x3, x0, x1, 7 );
|
||||||
ROL2( x0, x1, x2, x3, 7 );
|
x0 = _mm256_xor_si256( y0, x4 );
|
||||||
ROL2( x2, x3, y0, y1, 7 );
|
x1 = _mm256_xor_si256( y1, x5 );
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
x2 = _mm256_xor_si256( x2, x6 );
|
||||||
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
x4 = mm256_swap128_64( x4 );
|
x4 = mm256_swap128_64( x4 );
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
|
||||||
x5 = mm256_swap128_64( x5 );
|
x5 = mm256_swap128_64( x5 );
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
|
||||||
x4 = _mm256_add_epi32( x0, x4 );
|
|
||||||
x6 = mm256_swap128_64( x6 );
|
x6 = mm256_swap128_64( x6 );
|
||||||
y0 = x0;
|
|
||||||
x5 = _mm256_add_epi32( x1, x5 );
|
|
||||||
x7 = mm256_swap128_64( x7 );
|
x7 = mm256_swap128_64( x7 );
|
||||||
|
x4 = _mm256_add_epi32( x0, x4 );
|
||||||
|
x5 = _mm256_add_epi32( x1, x5 );
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
x6 = _mm256_add_epi32( x2, x6 );
|
||||||
y1 = x2;
|
|
||||||
ROL2( x0, x1, x1, y0, 11 );
|
|
||||||
x7 = _mm256_add_epi32( x3, x7 );
|
x7 = _mm256_add_epi32( x3, x7 );
|
||||||
ROL2( x2, x3, x3, y1, 11 );
|
ROL2( y0, x1, x1, x0, 11 );
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
ROL2( y1, x3, x3, x2, 11 );
|
||||||
x4 = mm256_swap64_32( x4 );
|
x0 = _mm256_xor_si256( y0, x4 );
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
x1 = _mm256_xor_si256( x1, x5 );
|
||||||
x5 = mm256_swap64_32( x5 );
|
x2 = _mm256_xor_si256( y1, x6 );
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
|
||||||
x6 = mm256_swap64_32( x6 );
|
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
|
x4 = mm256_swap64_32( x4 );
|
||||||
|
x5 = mm256_swap64_32( x5 );
|
||||||
|
x6 = mm256_swap64_32( x6 );
|
||||||
x7 = mm256_swap64_32( x7 );
|
x7 = mm256_swap64_32( x7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
|||||||
sp->rounds = rounds;
|
sp->rounds = rounds;
|
||||||
sp->pos = 0;
|
sp->pos = 0;
|
||||||
|
|
||||||
h[ 0] = m256_const1_128( iv[0] );
|
|
||||||
h[ 1] = m256_const1_128( iv[1] );
|
|
||||||
h[ 2] = m256_const1_128( iv[2] );
|
|
||||||
h[ 3] = m256_const1_128( iv[3] );
|
|
||||||
h[ 4] = m256_const1_128( iv[4] );
|
|
||||||
h[ 5] = m256_const1_128( iv[5] );
|
|
||||||
h[ 6] = m256_const1_128( iv[6] );
|
|
||||||
h[ 7] = m256_const1_128( iv[7] );
|
|
||||||
h[ 0] = m256_const1_128( iv[0] );
|
h[ 0] = m256_const1_128( iv[0] );
|
||||||
h[ 1] = m256_const1_128( iv[1] );
|
h[ 1] = m256_const1_128( iv[1] );
|
||||||
h[ 2] = m256_const1_128( iv[2] );
|
h[ 2] = m256_const1_128( iv[2] );
|
||||||
@@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||||
{
|
{
|
||||||
const int len = size >> 4;
|
const int len = size >> 4;
|
||||||
|
@@ -15,11 +15,11 @@
|
|||||||
|
|
||||||
struct _cubehashParam
|
struct _cubehashParam
|
||||||
{
|
{
|
||||||
|
__m128i _ALIGN(64) x[8]; // aligned for __m512i
|
||||||
int hashlen; // __m128i
|
int hashlen; // __m128i
|
||||||
int rounds;
|
int rounds;
|
||||||
int blocksize; // __m128i
|
int blocksize; // __m128i
|
||||||
int pos; // number of __m128i read into x from current block
|
int pos; // number of __m128i read into x from current block
|
||||||
__m128i _ALIGN(64) x[8]; // aligned for __m256i
|
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct _cubehashParam cubehashParam;
|
typedef struct _cubehashParam cubehashParam;
|
||||||
|
@@ -13,8 +13,7 @@
|
|||||||
|
|
||||||
#if defined (ALLIUM_16WAY)
|
#if defined (ALLIUM_16WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef union {
|
||||||
blake256_16way_context blake;
|
|
||||||
keccak256_8way_context keccak;
|
keccak256_8way_context keccak;
|
||||||
cube_4way_2buf_context cube;
|
cube_4way_2buf_context cube;
|
||||||
skein256_8way_context skein;
|
skein256_8way_context skein;
|
||||||
@@ -25,41 +24,31 @@ typedef struct {
|
|||||||
#endif
|
#endif
|
||||||
} allium_16way_ctx_holder;
|
} allium_16way_ctx_holder;
|
||||||
|
|
||||||
static __thread allium_16way_ctx_holder allium_16way_ctx;
|
static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||||
|
const void *midhash, const void *block )
|
||||||
bool init_allium_16way_ctx()
|
|
||||||
{
|
|
||||||
keccak256_8way_init( &allium_16way_ctx.keccak );
|
|
||||||
skein256_8way_init( &allium_16way_ctx.skein );
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void allium_16way_hash( void *state, const void *input )
|
|
||||||
{
|
{
|
||||||
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
|
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
|
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
uint32_t hash8[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash9[8] __attribute__ ((aligned (64)));
|
uint32_t hash9[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash10[8] __attribute__ ((aligned (64)));
|
uint32_t hash10[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash11[8] __attribute__ ((aligned (64)));
|
uint32_t hash11[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash12[8] __attribute__ ((aligned (64)));
|
uint32_t hash12[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash13[8] __attribute__ ((aligned (64)));
|
uint32_t hash13[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash14[8] __attribute__ ((aligned (64)));
|
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash15[8] __attribute__ ((aligned (64)));
|
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
|
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||||
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
|
|
||||||
blake256_16way_close( &ctx.blake, vhash );
|
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||||
@@ -69,6 +58,7 @@ void allium_16way_hash( void *state, const void *input )
|
|||||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
hash15, 256 );
|
hash15, 256 );
|
||||||
|
|
||||||
|
keccak256_8way_init( &ctx.keccak );
|
||||||
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
||||||
keccak256_8way_close( &ctx.keccak, vhashA);
|
keccak256_8way_close( &ctx.keccak, vhashA);
|
||||||
keccak256_8way_init( &ctx.keccak );
|
keccak256_8way_init( &ctx.keccak );
|
||||||
@@ -151,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
|
|||||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
hash15, 256 );
|
hash15, 256 );
|
||||||
|
|
||||||
|
skein256_8way_init( &ctx.skein );
|
||||||
skein256_8way_update( &ctx.skein, vhashA, 32 );
|
skein256_8way_update( &ctx.skein, vhashA, 32 );
|
||||||
skein256_8way_close( &ctx.skein, vhashA );
|
skein256_8way_close( &ctx.skein, vhashA );
|
||||||
skein256_8way_init( &ctx.skein );
|
skein256_8way_init( &ctx.skein );
|
||||||
@@ -198,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
|
|||||||
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
|
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
|
||||||
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
|
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
|
||||||
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
|
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -205,35 +197,72 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
|||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||||
|
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||||
|
{
|
||||||
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||||
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||||
|
};
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
const uint32_t last_nonce = max_nonce - 16;
|
const uint32_t last_nonce = max_nonce - 16;
|
||||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m512i sixteen = m512_const1_32( 16 );
|
||||||
|
|
||||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||||
|
|
||||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
// Prehash first block.
|
||||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
blake256_transform_le( phash, pdata, 512, 0 );
|
||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
|
||||||
|
|
||||||
blake256_16way_init( &allium_16way_ctx.blake );
|
// Interleave hash for second block prehash.
|
||||||
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
|
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||||
|
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||||
|
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||||
|
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||||
|
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||||
|
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||||
|
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||||
|
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||||
|
|
||||||
|
// Build vectored second block, interleave last 16 bytes of data using
|
||||||
|
// unique nonces, add padding.
|
||||||
|
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||||
|
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||||
|
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||||
|
block_buf[ 3] =
|
||||||
|
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||||
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||||
|
block_buf[ 4] = m512_const1_32( 0x80000000 );
|
||||||
|
block_buf[ 5] =
|
||||||
|
block_buf[ 6] =
|
||||||
|
block_buf[ 7] =
|
||||||
|
block_buf[ 8] =
|
||||||
|
block_buf[ 9] =
|
||||||
|
block_buf[10] =
|
||||||
|
block_buf[11] =
|
||||||
|
block_buf[12] = m512_zero;
|
||||||
|
block_buf[13] = m512_one_32;
|
||||||
|
block_buf[14] = m512_zero;
|
||||||
|
block_buf[15] = m512_const1_32( 80*8 );
|
||||||
|
|
||||||
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
|
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
allium_16way_hash( hash, vdata );
|
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 16; lane++ )
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, hash+(lane<<3), mythr );
|
submit_solution( work, hash+(lane<<3), mythr );
|
||||||
}
|
}
|
||||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
|
||||||
n += 16;
|
n += 16;
|
||||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
@@ -243,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#elif defined (ALLIUM_8WAY)
|
#elif defined (ALLIUM_8WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef union {
|
||||||
blake256_8way_context blake;
|
|
||||||
keccak256_4way_context keccak;
|
keccak256_4way_context keccak;
|
||||||
cube_2way_context cube;
|
cube_2way_context cube;
|
||||||
skein256_4way_context skein;
|
skein256_4way_context skein;
|
||||||
@@ -255,19 +283,11 @@ typedef struct {
|
|||||||
#endif
|
#endif
|
||||||
} allium_8way_ctx_holder;
|
} allium_8way_ctx_holder;
|
||||||
|
|
||||||
static __thread allium_8way_ctx_holder allium_8way_ctx;
|
static void allium_8way_hash( void *hash, const void *midstate_vars,
|
||||||
|
const void *midhash, const void *block )
|
||||||
bool init_allium_8way_ctx()
|
|
||||||
{
|
|
||||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
|
||||||
skein256_4way_init( &allium_8way_ctx.skein );
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void allium_8way_hash( void *hash, const void *input )
|
|
||||||
{
|
{
|
||||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
|
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
|
||||||
uint64_t *hash0 = (uint64_t*)hash;
|
uint64_t *hash0 = (uint64_t*)hash;
|
||||||
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
||||||
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
||||||
@@ -278,15 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
|
|||||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
|
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
|
||||||
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
|
|
||||||
blake256_8way_close( &ctx.blake, vhashA );
|
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
vhashA, 256 );
|
vhashA, 256 );
|
||||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
|
keccak256_4way_init( &ctx.keccak );
|
||||||
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
|
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhashA );
|
keccak256_4way_close( &ctx.keccak, vhashA );
|
||||||
keccak256_4way_init( &ctx.keccak );
|
keccak256_4way_init( &ctx.keccak );
|
||||||
@@ -305,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
|
|||||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||||
|
|
||||||
|
|
||||||
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
||||||
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
||||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||||
@@ -332,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
|
|||||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
|
skein256_4way_init( &ctx.skein );
|
||||||
skein256_4way_update( &ctx.skein, vhashA, 32 );
|
skein256_4way_update( &ctx.skein, vhashA, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhashA );
|
skein256_4way_close( &ctx.skein, vhashA );
|
||||||
skein256_4way_init( &ctx.skein );
|
skein256_4way_init( &ctx.skein );
|
||||||
@@ -340,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )
|
|||||||
|
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
|
|
||||||
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
|
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
|
||||||
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
|
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
|
||||||
|
|
||||||
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
|
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
|
||||||
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
|
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
|
||||||
@@ -376,36 +395,72 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
|||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||||
|
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||||
|
{
|
||||||
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||||
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||||
|
};
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint64_t *ptarget = (uint64_t*)work->target;
|
uint64_t *ptarget = (uint64_t*)work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
const uint32_t last_nonce = max_nonce - 8;
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m256i eight = m256_const1_32( 8 );
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
// Prehash first block
|
||||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
blake256_transform_le( phash, pdata, 512, 0 );
|
||||||
|
|
||||||
blake256_8way_init( &allium_8way_ctx.blake );
|
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||||
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
|
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||||
|
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||||
|
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||||
|
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||||
|
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||||
|
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||||
|
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||||
|
|
||||||
|
// Build vectored second block, interleave last 16 bytes of data using
|
||||||
|
// unique nonces and add padding.
|
||||||
|
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||||
|
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||||
|
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||||
|
block_buf[ 3] =
|
||||||
|
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||||
|
block_buf[ 4] = m256_const1_32( 0x80000000 );
|
||||||
|
block_buf[ 5] =
|
||||||
|
block_buf[ 6] =
|
||||||
|
block_buf[ 7] =
|
||||||
|
block_buf[ 8] =
|
||||||
|
block_buf[ 9] =
|
||||||
|
block_buf[10] =
|
||||||
|
block_buf[11] =
|
||||||
|
block_buf[12] = m256_zero;
|
||||||
|
block_buf[13] = m256_one_32;
|
||||||
|
block_buf[14] = m256_zero;
|
||||||
|
block_buf[15] = m256_const1_32( 80*8 );
|
||||||
|
|
||||||
|
// Partialy prehash second block without touching nonces
|
||||||
|
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
allium_8way_hash( hash, vdata );
|
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
{
|
{
|
||||||
const uint64_t *lane_hash = hash + (lane<<2);
|
const uint64_t *lane_hash = hash + (lane<<2);
|
||||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, lane_hash, mythr );
|
submit_solution( work, lane_hash, mythr );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n += 8;
|
n += 8;
|
||||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
|
||||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
|
@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
|||||||
#if defined(LYRA2Z_16WAY)
|
#if defined(LYRA2Z_16WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
||||||
gate->hash = (void*)&lyra2z_16way_hash;
|
// gate->hash = (void*)&lyra2z_16way_hash;
|
||||||
#elif defined(LYRA2Z_8WAY)
|
#elif defined(LYRA2Z_8WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
||||||
gate->hash = (void*)&lyra2z_8way_hash;
|
// gate->hash = (void*)&lyra2z_8way_hash;
|
||||||
#elif defined(LYRA2Z_4WAY)
|
#elif defined(LYRA2Z_4WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||||
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
|||||||
bool register_allium_algo( algo_gate_t* gate )
|
bool register_allium_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (ALLIUM_16WAY)
|
#if defined (ALLIUM_16WAY)
|
||||||
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
|
|
||||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||||
gate->hash = (void*)&allium_16way_hash;
|
|
||||||
#elif defined (ALLIUM_8WAY)
|
#elif defined (ALLIUM_8WAY)
|
||||||
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
|
|
||||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||||
gate->hash = (void*)&allium_8way_hash;
|
|
||||||
#else
|
#else
|
||||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||||
gate->scanhash = (void*)&scanhash_allium;
|
gate->scanhash = (void*)&scanhash_allium;
|
||||||
|
@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
|
|||||||
|
|
||||||
#if defined(LYRA2Z_16WAY)
|
#if defined(LYRA2Z_16WAY)
|
||||||
|
|
||||||
void lyra2z_16way_hash( void *state, const void *input );
|
//void lyra2z_16way_hash( void *state, const void *input );
|
||||||
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
bool lyra2z_16way_thread_init();
|
bool lyra2z_16way_thread_init();
|
||||||
|
|
||||||
#elif defined(LYRA2Z_8WAY)
|
#elif defined(LYRA2Z_8WAY)
|
||||||
|
|
||||||
void lyra2z_8way_hash( void *state, const void *input );
|
//void lyra2z_8way_hash( void *state, const void *input );
|
||||||
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
bool lyra2z_8way_thread_init();
|
bool lyra2z_8way_thread_init();
|
||||||
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
|
|||||||
|
|
||||||
#if defined(ALLIUM_16WAY)
|
#if defined(ALLIUM_16WAY)
|
||||||
|
|
||||||
void allium_16way_hash( void *state, const void *input );
|
|
||||||
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
bool init_allium_16way_ctx();
|
|
||||||
|
|
||||||
#elif defined(ALLIUM_8WAY)
|
#elif defined(ALLIUM_8WAY)
|
||||||
|
|
||||||
void allium_8way_hash( void *state, const void *input );
|
|
||||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
bool init_allium_8way_ctx();
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@@ -14,38 +14,28 @@ bool lyra2z_16way_thread_init()
|
|||||||
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
|
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
static __thread blake256_16way_context l2z_16way_blake_mid;
|
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
||||||
|
const void *midhash, const void *block )
|
||||||
void lyra2z_16way_midstate( const void* input )
|
|
||||||
{
|
|
||||||
blake256_16way_init( &l2z_16way_blake_mid );
|
|
||||||
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
|
|
||||||
}
|
|
||||||
|
|
||||||
void lyra2z_16way_hash( void *state, const void *input )
|
|
||||||
{
|
{
|
||||||
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
|
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
uint32_t hash8[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash9[8] __attribute__ ((aligned (64)));
|
uint32_t hash9[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash10[8] __attribute__ ((aligned (64)));
|
uint32_t hash10[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash11[8] __attribute__ ((aligned (64)));
|
uint32_t hash11[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash12[8] __attribute__ ((aligned (64)));
|
uint32_t hash12[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash13[8] __attribute__ ((aligned (64)));
|
uint32_t hash13[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash14[8] __attribute__ ((aligned (64)));
|
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash15[8] __attribute__ ((aligned (64)));
|
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||||
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
|
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||||
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
|
|
||||||
blake256_16way_close( &ctx_blake, vhash );
|
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||||
@@ -97,40 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
|
|||||||
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint64_t hash[4*16] __attribute__ ((aligned (128)));
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||||
|
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||||
|
{
|
||||||
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||||
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||||
|
};
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
const uint32_t last_nonce = max_nonce - 16;
|
const uint32_t last_nonce = max_nonce - 16;
|
||||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m512i sixteen = m512_const1_32( 16 );
|
||||||
|
|
||||||
if ( bench ) ptarget[7] = 0x0000ff;
|
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||||
|
|
||||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
// Prehash first block
|
||||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
blake256_transform_le( phash, pdata, 512, 0 );
|
||||||
|
|
||||||
|
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||||
|
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||||
|
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||||
|
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||||
|
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||||
|
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||||
|
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||||
|
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||||
|
|
||||||
|
// Build vectored second block, interleave last 16 bytes of data using
|
||||||
|
// unique nonces and add padding.
|
||||||
|
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||||
|
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||||
|
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||||
|
block_buf[ 3] =
|
||||||
|
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||||
lyra2z_16way_midstate( vdata );
|
block_buf[ 4] = m512_const1_32( 0x80000000 );
|
||||||
|
block_buf[ 5] =
|
||||||
|
block_buf[ 6] =
|
||||||
|
block_buf[ 7] =
|
||||||
|
block_buf[ 8] =
|
||||||
|
block_buf[ 9] =
|
||||||
|
block_buf[10] =
|
||||||
|
block_buf[11] =
|
||||||
|
block_buf[12] = m512_zero;
|
||||||
|
block_buf[13] = m512_one_32;
|
||||||
|
block_buf[14] = m512_zero;
|
||||||
|
block_buf[15] = m512_const1_32( 80*8 );
|
||||||
|
|
||||||
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
|
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
lyra2z_16way_hash( hash, vdata );
|
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 16; lane++ )
|
|
||||||
{
|
|
||||||
const uint64_t *lane_hash = hash + (lane<<2);
|
|
||||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
|
||||||
{
|
|
||||||
pdata[19] = bswap_32( n + lane );
|
|
||||||
submit_solution( work, lane_hash, mythr );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
|
||||||
n += 16;
|
|
||||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
|
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_solution( work, hash+(lane<<3), mythr );
|
||||||
|
}
|
||||||
|
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
|
||||||
|
n += 16;
|
||||||
|
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
@@ -145,30 +169,20 @@ bool lyra2z_8way_thread_init()
|
|||||||
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
static __thread blake256_8way_context l2z_8way_blake_mid;
|
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
||||||
|
const void *midhash, const void *block )
|
||||||
void lyra2z_8way_midstate( const void* input )
|
|
||||||
{
|
|
||||||
blake256_8way_init( &l2z_8way_blake_mid );
|
|
||||||
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
|
|
||||||
}
|
|
||||||
|
|
||||||
void lyra2z_8way_hash( void *state, const void *input )
|
|
||||||
{
|
{
|
||||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||||
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
|
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||||
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
|
|
||||||
blake256_8way_close( &ctx_blake, vhash );
|
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
@@ -182,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
|
|||||||
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
|
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
|
||||||
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
|
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
|
||||||
|
|
||||||
|
|
||||||
memcpy( state, hash0, 32 );
|
memcpy( state, hash0, 32 );
|
||||||
memcpy( state+ 32, hash1, 32 );
|
memcpy( state+ 32, hash1, 32 );
|
||||||
memcpy( state+ 64, hash2, 32 );
|
memcpy( state+ 64, hash2, 32 );
|
||||||
@@ -197,43 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
|||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||||
|
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||||
|
{
|
||||||
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||||
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||||
|
};
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint64_t *ptarget = (uint64_t*)work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
const uint32_t last_nonce = max_nonce - 8;
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m256i eight = m256_const1_32( 8 );
|
||||||
|
|
||||||
if ( bench ) ptarget[7] = 0x0000ff;
|
// Prehash first block
|
||||||
|
blake256_transform_le( phash, pdata, 512, 0 );
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||||
lyra2z_8way_midstate( vdata );
|
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||||
|
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||||
|
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||||
|
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||||
|
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||||
|
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||||
|
|
||||||
|
// Build vectored second block, interleave last 16 bytes of data using
|
||||||
|
// unique nonces and add padding.
|
||||||
|
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||||
|
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||||
|
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||||
|
block_buf[ 3] =
|
||||||
|
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||||
|
block_buf[ 4] = m256_const1_32( 0x80000000 );
|
||||||
|
block_buf[ 5] =
|
||||||
|
block_buf[ 6] =
|
||||||
|
block_buf[ 7] =
|
||||||
|
block_buf[ 8] =
|
||||||
|
block_buf[ 9] =
|
||||||
|
block_buf[10] =
|
||||||
|
block_buf[11] =
|
||||||
|
block_buf[12] = m256_zero;
|
||||||
|
block_buf[13] = m256_one_32;
|
||||||
|
block_buf[14] = m256_zero;
|
||||||
|
block_buf[15] = m256_const1_32( 80*8 );
|
||||||
|
|
||||||
|
// Partialy prehash second block without touching nonces
|
||||||
|
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
lyra2z_8way_hash( hash, vdata );
|
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
{
|
{
|
||||||
const uint64_t *lane_hash = hash + (lane<<2);
|
const uint64_t *lane_hash = hash + (lane<<2);
|
||||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, lane_hash, mythr );
|
submit_solution( work, lane_hash, mythr );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
n += 8;
|
||||||
n += 8;
|
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
|
||||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#elif defined(LYRA2Z_4WAY)
|
#elif defined(LYRA2Z_4WAY)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
// overlap it's unified.
|
// overlap it's unified.
|
||||||
// As a result normal is Nrows-2 / Nrows.
|
// As a result normal is Nrows-2 / Nrows.
|
||||||
// for 4 rows: 1 unified, 2 overlap, 1 normal.
|
// for 4 rows: 1 unified, 2 overlap, 1 normal.
|
||||||
// for 8 rows: 1 unified, 2 overlap, 56 normal.
|
// for 8 rows: 1 unified, 2 overlap, 5 normal.
|
||||||
|
|
||||||
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||||
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
|
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||||
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
|||||||
for ( i = 0; i < nCols; i++ )
|
for ( i = 0; i < nCols; i++ )
|
||||||
{
|
{
|
||||||
//Absorbing "M[prev] [+] M[row*]"
|
//Absorbing "M[prev] [+] M[row*]"
|
||||||
|
io0 = _mm512_load_si512( inout0 );
|
||||||
|
io1 = _mm512_load_si512( inout0 +1 );
|
||||||
|
io2 = _mm512_load_si512( inout0 +2 );
|
||||||
|
|
||||||
|
io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1 );
|
||||||
|
io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
|
||||||
|
io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
|
||||||
|
|
||||||
|
/*
|
||||||
io0 = _mm512_mask_blend_epi64( 0xf0,
|
io0 = _mm512_mask_blend_epi64( 0xf0,
|
||||||
_mm512_load_si512( (__m512i*)inout0 ),
|
_mm512_load_si512( (__m512i*)inout0 ),
|
||||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||||
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
|||||||
io2 = _mm512_mask_blend_epi64( 0xf0,
|
io2 = _mm512_mask_blend_epi64( 0xf0,
|
||||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||||
|
*/
|
||||||
|
|
||||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
|
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
|
||||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
|
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
|
||||||
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
|||||||
for ( i = 0; i < nCols; i++ )
|
for ( i = 0; i < nCols; i++ )
|
||||||
{
|
{
|
||||||
//Absorbing "M[prev] [+] M[row*]"
|
//Absorbing "M[prev] [+] M[row*]"
|
||||||
|
io0.v512 = _mm512_load_si512( inout0 );
|
||||||
|
io1.v512 = _mm512_load_si512( inout0 +1 );
|
||||||
|
io2.v512 = _mm512_load_si512( inout0 +2 );
|
||||||
|
|
||||||
|
io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1 );
|
||||||
|
io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
|
||||||
|
io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
|
||||||
|
|
||||||
|
/*
|
||||||
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
|
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||||
_mm512_load_si512( (__m512i*)inout0 ),
|
_mm512_load_si512( (__m512i*)inout0 ),
|
||||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||||
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
|||||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
|
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||||
|
*/
|
||||||
|
|
||||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
|
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
|
||||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
|
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
|
||||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
|
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
|
||||||
|
|
||||||
/*
|
|
||||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
|
|
||||||
_mm512_load_si512( (__m512i*)inout0 ),
|
|
||||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
|
||||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
|
|
||||||
_mm512_load_si512( (__m512i*)inout0 +1 ),
|
|
||||||
_mm512_load_si512( (__m512i*)inout1 +1 ) );
|
|
||||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
|
|
||||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
|
||||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
|
||||||
|
|
||||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
|
|
||||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
|
|
||||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
|
|
||||||
*/
|
|
||||||
|
|
||||||
//Applies the reduced-round transformation f to the sponge's state
|
//Applies the reduced-round transformation f to the sponge's state
|
||||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
|||||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
|
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
if ( rowOut == rowInOut0 )
|
|
||||||
{
|
|
||||||
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
|
|
||||||
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
|
|
||||||
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
|
|
||||||
|
|
||||||
}
|
|
||||||
if ( rowOut == rowInOut1 )
|
|
||||||
{
|
|
||||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
|
|
||||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
|
|
||||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||||
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
|||||||
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
|
||||||
|
casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
|
||||||
|
casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
|
||||||
|
_mm512_mask_store_epi64( inout1, 0xf0, io0.v512 );
|
||||||
|
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
|
||||||
|
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
casti_m256i( inout0, 0 ) = io0.v256lo;
|
casti_m256i( inout0, 0 ) = io0.v256lo;
|
||||||
casti_m256i( inout1, 1 ) = io0.v256hi;
|
casti_m256i( inout1, 1 ) = io0.v256hi;
|
||||||
casti_m256i( inout0, 2 ) = io1.v256lo;
|
casti_m256i( inout0, 2 ) = io1.v256lo;
|
||||||
casti_m256i( inout1, 3 ) = io1.v256hi;
|
casti_m256i( inout1, 3 ) = io1.v256hi;
|
||||||
casti_m256i( inout0, 4 ) = io2.v256lo;
|
casti_m256i( inout0, 4 ) = io2.v256lo;
|
||||||
casti_m256i( inout1, 5 ) = io2.v256hi;
|
casti_m256i( inout1, 5 ) = io2.v256hi;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
|
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
|
||||||
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
|
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
|
||||||
|
@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
|
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
|
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
|
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash0 [16] __attribute__ ((aligned (64)));
|
uint32_t hash0 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash1 [16] __attribute__ ((aligned (64)));
|
uint32_t hash1 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2 [16] __attribute__ ((aligned (64)));
|
uint32_t hash2 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3 [16] __attribute__ ((aligned (64)));
|
uint32_t hash3 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash4 [16] __attribute__ ((aligned (64)));
|
uint32_t hash4 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash5 [16] __attribute__ ((aligned (64)));
|
uint32_t hash5 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash6 [16] __attribute__ ((aligned (64)));
|
uint32_t hash6 [16] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash7 [16] __attribute__ ((aligned (64)));
|
uint32_t hash7 [16] __attribute__ ((aligned (32)));
|
||||||
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||||
__mmask8 vh_mask;
|
__mmask8 vh_mask;
|
||||||
const __m512i vmask = m512_const1_64( 24 );
|
const __m512i vmask = m512_const1_64( 24 );
|
||||||
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
|
|||||||
|
|
||||||
extern void hmq1725_4way_hash(void *state, const void *input)
|
extern void hmq1725_4way_hash(void *state, const void *input)
|
||||||
{
|
{
|
||||||
uint32_t hash0 [16] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash1 [16] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash2 [16] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash3 [16] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
|
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
|
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
|
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash0 [16] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash1 [16] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash2 [16] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash3 [16] __attribute__ ((aligned (32)));
|
||||||
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||||
__m256i vh_mask;
|
__m256i vh_mask;
|
||||||
int h_mask;
|
int h_mask;
|
||||||
|
@@ -16,7 +16,8 @@
|
|||||||
|
|
||||||
#if defined (X16R_8WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
|
||||||
// Perform midstate prehash of hash functions with block size <= 72 bytes.
|
// Perform midstate prehash of hash functions with block size <= 72 bytes,
|
||||||
|
// 76 bytes for hash functions that operate on 32 bit data.
|
||||||
|
|
||||||
void x16r_8way_prehash( void *vdata, void *pdata )
|
void x16r_8way_prehash( void *vdata, void *pdata )
|
||||||
{
|
{
|
||||||
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
|
|||||||
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
|
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
|
||||||
break;
|
break;
|
||||||
case LUFFA:
|
case LUFFA:
|
||||||
|
{
|
||||||
|
hashState_luffa ctx_luffa;
|
||||||
mm128_bswap32_80( edata, pdata );
|
mm128_bswap32_80( edata, pdata );
|
||||||
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
|
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||||
luffa_4way_init( &x16r_ctx.luffa, 512 );
|
edata, edata, edata, edata, 640 );
|
||||||
luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
|
init_luffa( &ctx_luffa, 512 );
|
||||||
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
|
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
|
||||||
|
intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
|
||||||
|
ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
|
||||||
|
intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
|
||||||
|
ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
|
||||||
|
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
|
||||||
|
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case CUBEHASH:
|
case CUBEHASH:
|
||||||
|
{
|
||||||
|
cubehashParam ctx_cube;
|
||||||
mm128_bswap32_80( edata, pdata );
|
mm128_bswap32_80( edata, pdata );
|
||||||
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
|
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||||
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
|
edata, edata, edata, edata, 640 );
|
||||||
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
|
cubehashInit( &ctx_cube, 512, 16, 32 );
|
||||||
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
|
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
|
||||||
|
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
|
||||||
|
x16r_ctx.cube.rounds = ctx_cube.rounds;
|
||||||
|
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
|
||||||
|
x16r_ctx.cube.pos = ctx_cube.pos;
|
||||||
|
intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
|
||||||
|
ctx_cube.x, 1024 );
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case HAMSI:
|
case HAMSI:
|
||||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
|
|||||||
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||||
{
|
{
|
||||||
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
|
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
uint32_t hash0[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash1[20] __attribute__ ((aligned (64)));
|
uint32_t hash1[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash2[20] __attribute__ ((aligned (64)));
|
uint32_t hash2[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash3[20] __attribute__ ((aligned (64)));
|
uint32_t hash3[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash4[20] __attribute__ ((aligned (64)));
|
uint32_t hash4[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash5[20] __attribute__ ((aligned (64)));
|
uint32_t hash5[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash6[20] __attribute__ ((aligned (64)));
|
uint32_t hash6[20] __attribute__ ((aligned (16)));
|
||||||
uint32_t hash7[20] __attribute__ ((aligned (64)));
|
uint32_t hash7[20] __attribute__ ((aligned (16)));
|
||||||
x16r_8way_context_overlay ctx;
|
x16r_8way_context_overlay ctx;
|
||||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||||
void *in0 = (void*) hash0;
|
void *in0 = (void*) hash0;
|
||||||
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
|||||||
{
|
{
|
||||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
uint32_t bedata1[2];
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
|||||||
s_ntime = ntime;
|
s_ntime = ntime;
|
||||||
|
|
||||||
if ( opt_debug && !thr_id )
|
if ( opt_debug && !thr_id )
|
||||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
|
||||||
}
|
}
|
||||||
|
|
||||||
x16r_8way_prehash( vdata, pdata );
|
x16r_8way_prehash( vdata, pdata );
|
||||||
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
|
|||||||
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
|
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
|
||||||
break;
|
break;
|
||||||
case LUFFA:
|
case LUFFA:
|
||||||
|
{
|
||||||
|
hashState_luffa ctx_luffa;
|
||||||
mm128_bswap32_80( edata, pdata );
|
mm128_bswap32_80( edata, pdata );
|
||||||
intrlv_2x128( vdata2, edata, edata, 640 );
|
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||||
luffa_2way_init( &x16r_ctx.luffa, 512 );
|
init_luffa( &ctx_luffa, 512 );
|
||||||
luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
|
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
|
||||||
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
|
intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
|
||||||
break;
|
ctx_luffa.buffer, 512 );
|
||||||
|
intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
|
||||||
|
ctx_luffa.chainv, 1280 );
|
||||||
|
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
|
||||||
|
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case CUBEHASH:
|
case CUBEHASH:
|
||||||
|
{
|
||||||
|
cubehashParam ctx_cube;
|
||||||
mm128_bswap32_80( edata, pdata );
|
mm128_bswap32_80( edata, pdata );
|
||||||
intrlv_2x128( vdata2, edata, edata, 640 );
|
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||||
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
|
cubehashInit( &ctx_cube, 512, 16, 32 );
|
||||||
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
|
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
|
||||||
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
|
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
|
||||||
|
x16r_ctx.cube.rounds = ctx_cube.rounds;
|
||||||
|
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
|
||||||
|
x16r_ctx.cube.pos = ctx_cube.pos;
|
||||||
|
intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case HAMSI:
|
case HAMSI:
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||||
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
|
|||||||
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||||
{
|
{
|
||||||
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
|
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
uint32_t hash0[20] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash1[20] __attribute__ ((aligned (64)));
|
uint32_t hash1[20] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2[20] __attribute__ ((aligned (64)));
|
uint32_t hash2[20] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3[20] __attribute__ ((aligned (64)));
|
uint32_t hash3[20] __attribute__ ((aligned (32)));
|
||||||
x16r_4way_context_overlay ctx;
|
x16r_4way_context_overlay ctx;
|
||||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||||
void *in0 = (void*) hash0;
|
void *in0 = (void*) hash0;
|
||||||
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
|||||||
{
|
{
|
||||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
uint32_t bedata1[2];
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
|||||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||||
s_ntime = ntime;
|
s_ntime = ntime;
|
||||||
if ( opt_debug && !thr_id )
|
if ( opt_debug && !thr_id )
|
||||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
|
||||||
}
|
}
|
||||||
|
|
||||||
x16r_4way_prehash( vdata, pdata );
|
x16r_4way_prehash( vdata, pdata );
|
||||||
|
@@ -30,8 +30,8 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
|||||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||||
s_ntime = masked_ntime;
|
s_ntime = masked_ntime;
|
||||||
if ( opt_debug && !thr_id )
|
if ( !thr_id )
|
||||||
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
|
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -84,8 +84,8 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
|||||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||||
s_ntime = masked_ntime;
|
s_ntime = masked_ntime;
|
||||||
if ( opt_debug && !thr_id )
|
if ( !thr_id )
|
||||||
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
|
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
|
|||||||
int x16rv2_8way_hash( void* output, const void* input, int thrid )
|
int x16rv2_8way_hash( void* output, const void* input, int thrid )
|
||||||
{
|
{
|
||||||
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
uint32_t hash0[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
uint32_t hash1[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
uint32_t hash2[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
uint32_t hash3[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash4[24] __attribute__ ((aligned (64)));
|
uint32_t hash4[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash5[24] __attribute__ ((aligned (64)));
|
uint32_t hash5[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash6[24] __attribute__ ((aligned (64)));
|
uint32_t hash6[24] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash7[24] __attribute__ ((aligned (64)));
|
uint32_t hash7[24] __attribute__ ((aligned (32)));
|
||||||
x16rv2_8way_context_overlay ctx;
|
x16rv2_8way_context_overlay ctx;
|
||||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||||
void *in0 = (void*) hash0;
|
void *in0 = (void*) hash0;
|
||||||
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )
|
|||||||
|
|
||||||
int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||||
{
|
{
|
||||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash1[20] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash2[20] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t hash3[20] __attribute__ ((aligned (64)));
|
|
||||||
uint32_t vhash[20*4] __attribute__ ((aligned (64)));
|
uint32_t vhash[20*4] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash0[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash1[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash2[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash3[20] __attribute__ ((aligned (32)));
|
||||||
x16rv2_4way_context_overlay ctx;
|
x16rv2_4way_context_overlay ctx;
|
||||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||||
void *in0 = (void*) hash0;
|
void *in0 = (void*) hash0;
|
||||||
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
|
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
uint32_t edata[20];
|
||||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
uint32_t bedata1[2];
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
if ( bench ) ptarget[7] = 0x0fff;
|
if ( bench ) ptarget[7] = 0x0fff;
|
||||||
|
|
||||||
|
|
||||||
bedata1[0] = bswap_32( pdata[1] );
|
bedata1[0] = bswap_32( pdata[1] );
|
||||||
bedata1[1] = bswap_32( pdata[2] );
|
bedata1[1] = bswap_32( pdata[2] );
|
||||||
|
|
||||||
|
@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
|
|||||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
uint64_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
uint64_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
uint64_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
uint64_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
sonoa_8way_context_overlay ctx;
|
sonoa_8way_context_overlay ctx;
|
||||||
|
|
||||||
// 1
|
// 1
|
||||||
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
|
|||||||
|
|
||||||
int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||||
{
|
{
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
sonoa_4way_context_overlay ctx;
|
sonoa_4way_context_overlay ctx;
|
||||||
|
|
||||||
// 1
|
// 1
|
||||||
|
@@ -58,23 +58,27 @@ union _x17_8way_context_overlay
|
|||||||
} __attribute__ ((aligned (64)));
|
} __attribute__ ((aligned (64)));
|
||||||
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
|
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
|
||||||
|
|
||||||
|
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
|
||||||
|
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
|
||||||
|
|
||||||
int x17_8way_hash( void *state, const void *input, int thr_id )
|
int x17_8way_hash( void *state, const void *input, int thr_id )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
uint64_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
uint64_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
uint64_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
uint64_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
x17_8way_context_overlay ctx;
|
x17_8way_context_overlay ctx;
|
||||||
|
|
||||||
blake512_8way_full( &ctx.blake, vhash, input, 80 );
|
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
|
||||||
|
x17_8way_midstate );
|
||||||
|
|
||||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||||
|
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
@@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
|
|||||||
|
|
||||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
||||||
|
|
||||||
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
|
||||||
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
|
||||||
|
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
|
|
||||||
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||||
@@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash32[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *hash32_d7 = &(hash32[7*8]);
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
const uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
|
const __m512i eight = m512_const1_64( 8 );
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
|
||||||
|
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||||
|
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||||
|
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||||
|
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||||
|
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||||
|
|
||||||
|
mm512_intrlv80_8x64( vdata, edata );
|
||||||
|
|
||||||
|
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||||
|
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||||
|
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||||
|
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) )
|
||||||
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
|
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
|
||||||
|
{
|
||||||
|
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||||
|
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = _mm512_add_epi32( *noncev, eight );
|
||||||
|
n += 8;
|
||||||
|
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#elif defined(X17_4WAY)
|
#elif defined(X17_4WAY)
|
||||||
|
|
||||||
union _x17_4way_context_overlay
|
union _x17_4way_context_overlay
|
||||||
@@ -271,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
|||||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
x17_4way_context_overlay ctx;
|
x17_4way_context_overlay ctx;
|
||||||
|
|
||||||
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||||
|
@@ -3,7 +3,7 @@
|
|||||||
bool register_x17_algo( algo_gate_t* gate )
|
bool register_x17_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X17_8WAY)
|
#if defined (X17_8WAY)
|
||||||
gate->scanhash = (void*)&scanhash_8way_64in_32out;
|
gate->scanhash = (void*)&scanhash_x17_8way;
|
||||||
gate->hash = (void*)&x17_8way_hash;
|
gate->hash = (void*)&x17_8way_hash;
|
||||||
#elif defined (X17_4WAY)
|
#elif defined (X17_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
||||||
|
@@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate );
|
|||||||
|
|
||||||
#if defined(X17_8WAY)
|
#if defined(X17_8WAY)
|
||||||
|
|
||||||
|
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
int x17_8way_hash( void *state, const void *input, int thr_id );
|
int x17_8way_hash( void *state, const void *input, int thr_id );
|
||||||
|
|
||||||
#elif defined(X17_4WAY)
|
#elif defined(X17_4WAY)
|
||||||
|
|
||||||
|
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
int x17_4way_hash( void *state, const void *input, int thr_id );
|
int x17_4way_hash( void *state, const void *input, int thr_id );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
|
|||||||
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
|
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
|
||||||
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
|
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
|
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[16] __attribute__ ((aligned (64)));
|
uint64_t hash0[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash1[16] __attribute__ ((aligned (64)));
|
uint64_t hash1[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash2[16] __attribute__ ((aligned (64)));
|
uint64_t hash2[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash3[16] __attribute__ ((aligned (64)));
|
uint64_t hash3[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash4[16] __attribute__ ((aligned (64)));
|
uint64_t hash4[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash5[16] __attribute__ ((aligned (64)));
|
uint64_t hash5[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash6[16] __attribute__ ((aligned (64)));
|
uint64_t hash6[16] __attribute__ ((aligned (32)));
|
||||||
uint64_t hash7[16] __attribute__ ((aligned (64)));
|
uint64_t hash7[16] __attribute__ ((aligned (32)));
|
||||||
const int dataLen = 128;
|
const int dataLen = 128;
|
||||||
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
|
|||||||
|
|
||||||
int xevan_4way_hash( void *output, const void *input, int thr_id )
|
int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||||
{
|
{
|
||||||
uint64_t hash0[16] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash1[16] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash2[16] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t hash3[16] __attribute__ ((aligned (64)));
|
|
||||||
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
|
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
|
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
|
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[16] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash1[16] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash2[16] __attribute__ ((aligned (32)));
|
||||||
|
uint64_t hash3[16] __attribute__ ((aligned (32)));
|
||||||
const int dataLen = 128;
|
const int dataLen = 128;
|
||||||
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
@@ -21,7 +21,6 @@
|
|||||||
#include "algo/tiger/sph_tiger.h"
|
#include "algo/tiger/sph_tiger.h"
|
||||||
#include "algo/lyra2/lyra2.h"
|
#include "algo/lyra2/lyra2.h"
|
||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/swifftx/swifftx.h"
|
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
#include "algo/groestl/groestl512-hash-4way.h"
|
#include "algo/groestl/groestl512-hash-4way.h"
|
||||||
#include "algo/shavite/shavite-hash-4way.h"
|
#include "algo/shavite/shavite-hash-4way.h"
|
||||||
|
@@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate )
|
|||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
|
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
|
||||||
AVX512_OPT | VAES_OPT;
|
AVX512_OPT | VAES_OPT;
|
||||||
|
InitializeSWIFFTX();
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -5,6 +5,7 @@
|
|||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include "algo/swifftx/swifftx.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X22I_8WAY 1
|
#define X22I_8WAY 1
|
||||||
|
@@ -24,7 +24,6 @@
|
|||||||
#include "algo/tiger/sph_tiger.h"
|
#include "algo/tiger/sph_tiger.h"
|
||||||
#include "algo/lyra2/lyra2.h"
|
#include "algo/lyra2/lyra2.h"
|
||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/swifftx/swifftx.h"
|
|
||||||
#include "algo/panama/panama-hash-4way.h"
|
#include "algo/panama/panama-hash-4way.h"
|
||||||
#include "algo/lanehash/lane.h"
|
#include "algo/lanehash/lane.h"
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
@@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay
|
|||||||
};
|
};
|
||||||
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
|
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
|
||||||
|
|
||||||
|
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
|
||||||
|
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
|
||||||
|
|
||||||
int x25x_8way_hash( void *output, const void *input, int thrid )
|
int x25x_8way_hash( void *output, const void *input, int thrid )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
@@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
|
|||||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||||
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
|
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake512_8way_init( &ctx.blake );
|
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
|
||||||
blake512_8way_update( &ctx.blake, input, 80 );
|
x25x_8way_midstate );
|
||||||
blake512_8way_close( &ctx.blake, vhash );
|
|
||||||
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
|
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
|
||||||
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
|
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
|
||||||
|
|
||||||
@@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
|
|||||||
intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
|
intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
|
||||||
hash4[10], hash5[10], hash6[10], hash7[10] );
|
hash4[10], hash5[10], hash6[10], hash7[10] );
|
||||||
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
init_echo( &ctx.echo, 512 );
|
init_echo( &ctx.echo, 512 );
|
||||||
@@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
|||||||
{
|
{
|
||||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||||
|
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t *hashd7 = &(hash[7*8]);
|
uint32_t *hashd7 = &(hash[7*8]);
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
@@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
|||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const uint32_t targ32 = ptarget[7];
|
const uint32_t targ32 = ptarget[7];
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m512i eight = m512_const1_64( 8 );
|
||||||
if ( bench ) ptarget[7] = 0x08ff;
|
if ( bench ) ptarget[7] = 0x08ff;
|
||||||
|
|
||||||
InitializeSWIFFTX();
|
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||||
|
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||||
|
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||||
|
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||||
|
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||||
|
|
||||||
|
mm512_intrlv80_8x64( vdata, edata );
|
||||||
|
|
||||||
|
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||||
|
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||||
|
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||||
|
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
|
||||||
|
|
||||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
|
||||||
*noncev = mm512_intrlv_blend_32(
|
|
||||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
|
||||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if ( x25x_8way_hash( hash, vdata, thr_id ) );
|
if ( x25x_8way_hash( hash, vdata, thr_id ) );
|
||||||
@@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
|||||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, lane_hash, mythr );
|
submit_solution( work, lane_hash, mythr );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*noncev = _mm512_add_epi32( *noncev,
|
*noncev = _mm512_add_epi32( *noncev, eight );
|
||||||
m512_const1_64( 0x0000000800000000 ) );
|
|
||||||
n += 8;
|
n += 8;
|
||||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
@@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay
|
|||||||
panama_4way_context panama;
|
panama_4way_context panama;
|
||||||
blake2s_4way_state blake2s;
|
blake2s_4way_state blake2s;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
|
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
|
||||||
|
|
||||||
|
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
|
||||||
|
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
|
||||||
|
|
||||||
int x25x_4way_hash( void *output, const void *input, int thrid )
|
int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||||
@@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
|||||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||||
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
|
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
|
||||||
|
x25x_4way_midstate );
|
||||||
|
|
||||||
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
|
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
|
||||||
|
|
||||||
bmw512_4way_init( &ctx.bmw );
|
bmw512_4way_init( &ctx.bmw );
|
||||||
@@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
|
|||||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||||
uint32_t *hashd7 = &(hash[ 7*4 ]);
|
uint32_t *hashd7 = &(hash[ 7*4 ]);
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
@@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
|
|||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const uint32_t targ32 = ptarget[7];
|
const uint32_t targ32 = ptarget[7];
|
||||||
|
const __m256i four = m256_const1_64( 4 );
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
|
||||||
if ( bench ) ptarget[7] = 0x08ff;
|
if ( bench ) ptarget[7] = 0x08ff;
|
||||||
|
|
||||||
InitializeSWIFFTX();
|
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||||
|
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||||
|
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||||
|
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||||
|
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_intrlv80_4x64( vdata, edata );
|
||||||
*noncev = mm256_intrlv_blend_32(
|
|
||||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
*noncev = mm256_intrlv_blend_32( *noncev,
|
||||||
|
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||||
|
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if ( x25x_4way_hash( hash, vdata, thr_id ) )
|
if ( x25x_4way_hash( hash, vdata, thr_id ) )
|
||||||
@@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
|
|||||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||||
if ( valid_hash( lane_hash, ptarget ) )
|
if ( valid_hash( lane_hash, ptarget ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, lane_hash, mythr );
|
submit_solution( work, lane_hash, mythr );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*noncev = _mm256_add_epi32( *noncev,
|
*noncev = _mm256_add_epi32( *noncev, four );
|
||||||
m256_const1_64( 0x0000000400000000 ) );
|
|
||||||
n += 4;
|
n += 4;
|
||||||
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
|
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
|||||||
#! /bin/sh
|
#! /bin/sh
|
||||||
# Guess values for system-dependent variables and create Makefiles.
|
# Guess values for system-dependent variables and create Makefiles.
|
||||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.6.
|
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.9.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
|||||||
# Identity of this package.
|
# Identity of this package.
|
||||||
PACKAGE_NAME='cpuminer-opt'
|
PACKAGE_NAME='cpuminer-opt'
|
||||||
PACKAGE_TARNAME='cpuminer-opt'
|
PACKAGE_TARNAME='cpuminer-opt'
|
||||||
PACKAGE_VERSION='3.19.6'
|
PACKAGE_VERSION='3.19.9'
|
||||||
PACKAGE_STRING='cpuminer-opt 3.19.6'
|
PACKAGE_STRING='cpuminer-opt 3.19.9'
|
||||||
PACKAGE_BUGREPORT=''
|
PACKAGE_BUGREPORT=''
|
||||||
PACKAGE_URL=''
|
PACKAGE_URL=''
|
||||||
|
|
||||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
|||||||
# Omit some internal or obsolete options to make the list less imposing.
|
# Omit some internal or obsolete options to make the list less imposing.
|
||||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||||
cat <<_ACEOF
|
cat <<_ACEOF
|
||||||
\`configure' configures cpuminer-opt 3.19.6 to adapt to many kinds of systems.
|
\`configure' configures cpuminer-opt 3.19.9 to adapt to many kinds of systems.
|
||||||
|
|
||||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||||
|
|
||||||
@@ -1404,7 +1404,7 @@ fi
|
|||||||
|
|
||||||
if test -n "$ac_init_help"; then
|
if test -n "$ac_init_help"; then
|
||||||
case $ac_init_help in
|
case $ac_init_help in
|
||||||
short | recursive ) echo "Configuration of cpuminer-opt 3.19.6:";;
|
short | recursive ) echo "Configuration of cpuminer-opt 3.19.9:";;
|
||||||
esac
|
esac
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
|
|
||||||
@@ -1509,7 +1509,7 @@ fi
|
|||||||
test -n "$ac_init_help" && exit $ac_status
|
test -n "$ac_init_help" && exit $ac_status
|
||||||
if $ac_init_version; then
|
if $ac_init_version; then
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
cpuminer-opt configure 3.19.6
|
cpuminer-opt configure 3.19.9
|
||||||
generated by GNU Autoconf 2.69
|
generated by GNU Autoconf 2.69
|
||||||
|
|
||||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
|||||||
This file contains any messages produced by compilers while
|
This file contains any messages produced by compilers while
|
||||||
running configure, to aid debugging if configure makes a mistake.
|
running configure, to aid debugging if configure makes a mistake.
|
||||||
|
|
||||||
It was created by cpuminer-opt $as_me 3.19.6, which was
|
It was created by cpuminer-opt $as_me 3.19.9, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
$ $0 $@
|
$ $0 $@
|
||||||
@@ -2993,7 +2993,7 @@ fi
|
|||||||
|
|
||||||
# Define the identity of the package.
|
# Define the identity of the package.
|
||||||
PACKAGE='cpuminer-opt'
|
PACKAGE='cpuminer-opt'
|
||||||
VERSION='3.19.6'
|
VERSION='3.19.9'
|
||||||
|
|
||||||
|
|
||||||
cat >>confdefs.h <<_ACEOF
|
cat >>confdefs.h <<_ACEOF
|
||||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||||||
# report actual input values of CONFIG_FILES etc. instead of their
|
# report actual input values of CONFIG_FILES etc. instead of their
|
||||||
# values after options handling.
|
# values after options handling.
|
||||||
ac_log="
|
ac_log="
|
||||||
This file was extended by cpuminer-opt $as_me 3.19.6, which was
|
This file was extended by cpuminer-opt $as_me 3.19.9, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
CONFIG_FILES = $CONFIG_FILES
|
CONFIG_FILES = $CONFIG_FILES
|
||||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
|||||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||||
ac_cs_version="\\
|
ac_cs_version="\\
|
||||||
cpuminer-opt config.status 3.19.6
|
cpuminer-opt config.status 3.19.9
|
||||||
configured by $0, generated by GNU Autoconf 2.69,
|
configured by $0, generated by GNU Autoconf 2.69,
|
||||||
with options \\"\$ac_cs_config\\"
|
with options \\"\$ac_cs_config\\"
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.19.6])
|
AC_INIT([cpuminer-opt], [3.19.9])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
65
cpu-miner.c
65
cpu-miner.c
@@ -105,8 +105,9 @@ bool opt_randomize = false;
|
|||||||
static int opt_retries = -1;
|
static int opt_retries = -1;
|
||||||
static int opt_fail_pause = 10;
|
static int opt_fail_pause = 10;
|
||||||
static int opt_time_limit = 0;
|
static int opt_time_limit = 0;
|
||||||
|
static unsigned int time_limit_stop = 0;
|
||||||
int opt_timeout = 300;
|
int opt_timeout = 300;
|
||||||
static int opt_scantime = 5;
|
static int opt_scantime = 0;
|
||||||
const int min_scantime = 1;
|
const int min_scantime = 1;
|
||||||
//static const bool opt_time = true;
|
//static const bool opt_time = true;
|
||||||
enum algos opt_algo = ALGO_NULL;
|
enum algos opt_algo = ALGO_NULL;
|
||||||
@@ -341,6 +342,7 @@ void get_currentalgo(char* buf, int sz)
|
|||||||
|
|
||||||
void proper_exit(int reason)
|
void proper_exit(int reason)
|
||||||
{
|
{
|
||||||
|
if (opt_debug) applog(LOG_INFO,"Program exit");
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
if (opt_background) {
|
if (opt_background) {
|
||||||
HWND hcon = GetConsoleWindow();
|
HWND hcon = GetConsoleWindow();
|
||||||
@@ -1097,7 +1099,7 @@ void report_summary_log( bool force )
|
|||||||
sprintf_et( et_str, et.tv_sec );
|
sprintf_et( et_str, et.tv_sec );
|
||||||
sprintf_et( upt_str, uptime.tv_sec );
|
sprintf_et( upt_str, uptime.tv_sec );
|
||||||
|
|
||||||
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
|
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url );
|
||||||
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
|
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
|
||||||
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
|
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
|
||||||
submit_rate, safe_div( (double)submitted_share_count*60.,
|
submit_rate, safe_div( (double)submitted_share_count*60.,
|
||||||
@@ -2201,8 +2203,6 @@ static void *miner_thread( void *userdata )
|
|||||||
// : 0;
|
// : 0;
|
||||||
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
|
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
|
||||||
|
|
||||||
time_t firstwork_time = 0;
|
|
||||||
int i;
|
|
||||||
memset( &work, 0, sizeof(work) );
|
memset( &work, 0, sizeof(work) );
|
||||||
|
|
||||||
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
|
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
|
||||||
@@ -2291,12 +2291,11 @@ static void *miner_thread( void *userdata )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else // GBT or getwork
|
else if ( !opt_benchmark ) // GBT or getwork
|
||||||
{
|
{
|
||||||
pthread_rwlock_wrlock( &g_work_lock );
|
pthread_rwlock_wrlock( &g_work_lock );
|
||||||
|
|
||||||
if ( ( ( time(NULL) - g_work_time )
|
if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
|
||||||
>= ( have_longpoll ? LP_SCANTIME : opt_scantime ) )
|
|
||||||
|| ( *nonceptr >= end_nonce ) )
|
|| ( *nonceptr >= end_nonce ) )
|
||||||
{
|
{
|
||||||
if ( unlikely( !get_work( mythr, &g_work ) ) )
|
if ( unlikely( !get_work( mythr, &g_work ) ) )
|
||||||
@@ -2325,25 +2324,14 @@ static void *miner_thread( void *userdata )
|
|||||||
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
|
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// LP_SCANTIME overrides opt_scantime option, is this right?
|
// opt_scantime expressed in hashes
|
||||||
|
max64 = opt_scantime * thr_hashrates[thr_id];
|
||||||
// adjust max_nonce to meet target scan time. Stratum and longpoll
|
|
||||||
// can go longer because they can rely on restart_threads to signal
|
|
||||||
// an early abort. get_work on the other hand can't rely on
|
|
||||||
// restart_threads so need a much shorter scantime
|
|
||||||
if ( have_stratum )
|
|
||||||
max64 = 60 * thr_hashrates[thr_id];
|
|
||||||
else if ( have_longpoll )
|
|
||||||
max64 = LP_SCANTIME * thr_hashrates[thr_id];
|
|
||||||
else // getwork inline
|
|
||||||
max64 = opt_scantime * thr_hashrates[thr_id];
|
|
||||||
|
|
||||||
// time limit
|
// time limit
|
||||||
if ( unlikely( opt_time_limit && firstwork_time ) )
|
if ( unlikely( opt_time_limit ) )
|
||||||
{
|
{
|
||||||
int passed = (int)( time(NULL) - firstwork_time );
|
unsigned int now = (unsigned int)time(NULL);
|
||||||
int remain = (int)( opt_time_limit - passed );
|
if ( now >= time_limit_stop )
|
||||||
if ( remain < 0 )
|
|
||||||
{
|
{
|
||||||
if ( thr_id != 0 )
|
if ( thr_id != 0 )
|
||||||
{
|
{
|
||||||
@@ -2355,14 +2343,16 @@ static void *miner_thread( void *userdata )
|
|||||||
char rate[32];
|
char rate[32];
|
||||||
format_hashrate( global_hashrate, rate );
|
format_hashrate( global_hashrate, rate );
|
||||||
applog( LOG_NOTICE, "Benchmark: %s", rate );
|
applog( LOG_NOTICE, "Benchmark: %s", rate );
|
||||||
fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
applog( LOG_NOTICE,
|
applog( LOG_NOTICE, "Mining timeout of %ds reached, exiting...",
|
||||||
"Mining timeout of %ds reached, exiting...", opt_time_limit);
|
opt_time_limit);
|
||||||
proper_exit(0);
|
|
||||||
|
proper_exit(0);
|
||||||
}
|
}
|
||||||
if ( remain < max64 ) max64 = remain;
|
// else
|
||||||
|
if ( time_limit_stop - now < opt_scantime )
|
||||||
|
max64 = ( time_limit_stop - now ) * thr_hashrates[thr_id] ;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Select nonce range based on max64, the estimated number of hashes
|
// Select nonce range based on max64, the estimated number of hashes
|
||||||
@@ -2378,8 +2368,6 @@ static void *miner_thread( void *userdata )
|
|||||||
max_nonce = work_nonce + (uint32_t)max64;
|
max_nonce = work_nonce + (uint32_t)max64;
|
||||||
|
|
||||||
// init time
|
// init time
|
||||||
if ( firstwork_time == 0 )
|
|
||||||
firstwork_time = time(NULL);
|
|
||||||
hashes_done = 0;
|
hashes_done = 0;
|
||||||
gettimeofday( (struct timeval *) &tv_start, NULL );
|
gettimeofday( (struct timeval *) &tv_start, NULL );
|
||||||
|
|
||||||
@@ -2452,7 +2440,7 @@ static void *miner_thread( void *userdata )
|
|||||||
{
|
{
|
||||||
double hashrate = 0.;
|
double hashrate = 0.;
|
||||||
pthread_mutex_lock( &stats_lock );
|
pthread_mutex_lock( &stats_lock );
|
||||||
for ( i = 0; i < opt_n_threads; i++ )
|
for ( int i = 0; i < opt_n_threads; i++ )
|
||||||
hashrate += thr_hashrates[i];
|
hashrate += thr_hashrates[i];
|
||||||
global_hashrate = hashrate;
|
global_hashrate = hashrate;
|
||||||
pthread_mutex_unlock( &stats_lock );
|
pthread_mutex_unlock( &stats_lock );
|
||||||
@@ -2766,7 +2754,7 @@ static void *stratum_thread(void *userdata )
|
|||||||
stratum.url = (char*) tq_pop(mythr->q, NULL);
|
stratum.url = (char*) tq_pop(mythr->q, NULL);
|
||||||
if (!stratum.url)
|
if (!stratum.url)
|
||||||
goto out;
|
goto out;
|
||||||
applog( LOG_BLUE, "Stratum connect %s", short_url );
|
applog( LOG_BLUE, "Stratum connect %s", stratum.url );
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
@@ -3347,6 +3335,7 @@ void parse_arg(int key, char *arg )
|
|||||||
if ( strncasecmp( arg, "http://", 7 )
|
if ( strncasecmp( arg, "http://", 7 )
|
||||||
&& strncasecmp( arg, "https://", 8 )
|
&& strncasecmp( arg, "https://", 8 )
|
||||||
&& strncasecmp( arg, "stratum+tcp://", 14 )
|
&& strncasecmp( arg, "stratum+tcp://", 14 )
|
||||||
|
&& strncasecmp( arg, "stratum+ssl://", 14 )
|
||||||
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
|
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
|
||||||
{
|
{
|
||||||
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
|
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
|
||||||
@@ -3704,6 +3693,17 @@ int main(int argc, char *argv[])
|
|||||||
show_usage_and_exit(1);
|
show_usage_and_exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( !opt_scantime )
|
||||||
|
{
|
||||||
|
if ( have_stratum ) opt_scantime = 30;
|
||||||
|
else if ( have_longpoll ) opt_scantime = LP_SCANTIME;
|
||||||
|
else opt_scantime = 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( opt_time_limit )
|
||||||
|
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
|
||||||
|
|
||||||
|
|
||||||
// need to register to get algo optimizations for cpu capabilities
|
// need to register to get algo optimizations for cpu capabilities
|
||||||
// but that causes registration logs before cpu capabilities is output.
|
// but that causes registration logs before cpu capabilities is output.
|
||||||
// Would need to split register function into 2 parts. First part sets algo
|
// Would need to split register function into 2 parts. First part sets algo
|
||||||
@@ -3769,6 +3769,7 @@ int main(int argc, char *argv[])
|
|||||||
flags = CURL_GLOBAL_ALL;
|
flags = CURL_GLOBAL_ALL;
|
||||||
if ( !opt_benchmark )
|
if ( !opt_benchmark )
|
||||||
if ( strncasecmp( rpc_url, "https:", 6 )
|
if ( strncasecmp( rpc_url, "https:", 6 )
|
||||||
|
&& strncasecmp( rpc_url, "stratum+ssl://", 14 )
|
||||||
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
|
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
|
||||||
flags &= ~CURL_GLOBAL_SSL;
|
flags &= ~CURL_GLOBAL_SSL;
|
||||||
|
|
||||||
|
2
miner.h
2
miner.h
@@ -812,7 +812,7 @@ Options:\n\
|
|||||||
lyra2z330 Lyra2 330 rows\n\
|
lyra2z330 Lyra2 330 rows\n\
|
||||||
m7m Magi (XMG)\n\
|
m7m Magi (XMG)\n\
|
||||||
myr-gr Myriad-Groestl\n\
|
myr-gr Myriad-Groestl\n\
|
||||||
minotaur Ringcoin (RNG)\n\
|
minotaur\n\
|
||||||
neoscrypt NeoScrypt(128, 2, 1)\n\
|
neoscrypt NeoScrypt(128, 2, 1)\n\
|
||||||
nist5 Nist5\n\
|
nist5 Nist5\n\
|
||||||
pentablake 5 x blake512\n\
|
pentablake 5 x blake512\n\
|
||||||
|
@@ -508,6 +508,32 @@ static inline void mm128_bswap32_80( void *d, void *s )
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||||
|
{
|
||||||
|
uint32_t *s = (uint32_t*)src;
|
||||||
|
casti_m128i( d, 0 ) = _mm_set1_epi32( bswap_32( s[ 0] ) );
|
||||||
|
casti_m128i( d, 1 ) = _mm_set1_epi32( bswap_32( s[ 1] ) );
|
||||||
|
casti_m128i( d, 2 ) = _mm_set1_epi32( bswap_32( s[ 2] ) );
|
||||||
|
casti_m128i( d, 3 ) = _mm_set1_epi32( bswap_32( s[ 3] ) );
|
||||||
|
casti_m128i( d, 4 ) = _mm_set1_epi32( bswap_32( s[ 4] ) );
|
||||||
|
casti_m128i( d, 5 ) = _mm_set1_epi32( bswap_32( s[ 5] ) );
|
||||||
|
casti_m128i( d, 6 ) = _mm_set1_epi32( bswap_32( s[ 6] ) );
|
||||||
|
casti_m128i( d, 7 ) = _mm_set1_epi32( bswap_32( s[ 7] ) );
|
||||||
|
casti_m128i( d, 8 ) = _mm_set1_epi32( bswap_32( s[ 8] ) );
|
||||||
|
casti_m128i( d, 9 ) = _mm_set1_epi32( bswap_32( s[ 9] ) );
|
||||||
|
casti_m128i( d,10 ) = _mm_set1_epi32( bswap_32( s[10] ) );
|
||||||
|
casti_m128i( d,11 ) = _mm_set1_epi32( bswap_32( s[11] ) );
|
||||||
|
casti_m128i( d,12 ) = _mm_set1_epi32( bswap_32( s[12] ) );
|
||||||
|
casti_m128i( d,13 ) = _mm_set1_epi32( bswap_32( s[13] ) );
|
||||||
|
casti_m128i( d,14 ) = _mm_set1_epi32( bswap_32( s[14] ) );
|
||||||
|
casti_m128i( d,15 ) = _mm_set1_epi32( bswap_32( s[15] ) );
|
||||||
|
casti_m128i( d,16 ) = _mm_set1_epi32( bswap_32( s[16] ) );
|
||||||
|
casti_m128i( d,17 ) = _mm_set1_epi32( bswap_32( s[17] ) );
|
||||||
|
casti_m128i( d,18 ) = _mm_set1_epi32( bswap_32( s[18] ) );
|
||||||
|
casti_m128i( d,19 ) = _mm_set1_epi32( bswap_32( s[19] ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
|
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||||
{
|
{
|
||||||
__m128i s0 = casti_m128i( src,0 );
|
__m128i s0 = casti_m128i( src,0 );
|
||||||
@@ -561,6 +587,7 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
|
|||||||
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
|
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
|
||||||
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
|
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// 8x32
|
// 8x32
|
||||||
/*
|
/*
|
||||||
@@ -1110,6 +1137,31 @@ static inline void extr_lane_8x32( void *d, const void *s,
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||||
|
{
|
||||||
|
uint32_t *s = (uint32_t*)src;
|
||||||
|
casti_m256i( d, 0 ) = _mm256_set1_epi32( bswap_32( s[ 0] ) );
|
||||||
|
casti_m256i( d, 1 ) = _mm256_set1_epi32( bswap_32( s[ 1] ) );
|
||||||
|
casti_m256i( d, 2 ) = _mm256_set1_epi32( bswap_32( s[ 2] ) );
|
||||||
|
casti_m256i( d, 3 ) = _mm256_set1_epi32( bswap_32( s[ 3] ) );
|
||||||
|
casti_m256i( d, 4 ) = _mm256_set1_epi32( bswap_32( s[ 4] ) );
|
||||||
|
casti_m256i( d, 5 ) = _mm256_set1_epi32( bswap_32( s[ 5] ) );
|
||||||
|
casti_m256i( d, 6 ) = _mm256_set1_epi32( bswap_32( s[ 6] ) );
|
||||||
|
casti_m256i( d, 7 ) = _mm256_set1_epi32( bswap_32( s[ 7] ) );
|
||||||
|
casti_m256i( d, 8 ) = _mm256_set1_epi32( bswap_32( s[ 8] ) );
|
||||||
|
casti_m256i( d, 9 ) = _mm256_set1_epi32( bswap_32( s[ 9] ) );
|
||||||
|
casti_m256i( d,10 ) = _mm256_set1_epi32( bswap_32( s[10] ) );
|
||||||
|
casti_m256i( d,11 ) = _mm256_set1_epi32( bswap_32( s[11] ) );
|
||||||
|
casti_m256i( d,12 ) = _mm256_set1_epi32( bswap_32( s[12] ) );
|
||||||
|
casti_m256i( d,13 ) = _mm256_set1_epi32( bswap_32( s[13] ) );
|
||||||
|
casti_m256i( d,14 ) = _mm256_set1_epi32( bswap_32( s[14] ) );
|
||||||
|
casti_m256i( d,15 ) = _mm256_set1_epi32( bswap_32( s[15] ) );
|
||||||
|
casti_m256i( d,16 ) = _mm256_set1_epi32( bswap_32( s[16] ) );
|
||||||
|
casti_m256i( d,17 ) = _mm256_set1_epi32( bswap_32( s[17] ) );
|
||||||
|
casti_m256i( d,18 ) = _mm256_set1_epi32( bswap_32( s[18] ) );
|
||||||
|
casti_m256i( d,19 ) = _mm256_set1_epi32( bswap_32( s[19] ) );
|
||||||
|
}
|
||||||
|
/*
|
||||||
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||||
{
|
{
|
||||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
@@ -1170,6 +1222,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
|||||||
casti_m128i( d,38 ) =
|
casti_m128i( d,38 ) =
|
||||||
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
|
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
@@ -1718,6 +1771,31 @@ static inline void extr_lane_16x32( void *d, const void *s,
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||||
|
|
||||||
|
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||||
|
{
|
||||||
|
uint32_t *s = (uint32_t*)src;
|
||||||
|
casti_m512i( d, 0 ) = _mm512_set1_epi32( bswap_32( s[ 0] ) );
|
||||||
|
casti_m512i( d, 1 ) = _mm512_set1_epi32( bswap_32( s[ 1] ) );
|
||||||
|
casti_m512i( d, 2 ) = _mm512_set1_epi32( bswap_32( s[ 2] ) );
|
||||||
|
casti_m512i( d, 3 ) = _mm512_set1_epi32( bswap_32( s[ 3] ) );
|
||||||
|
casti_m512i( d, 4 ) = _mm512_set1_epi32( bswap_32( s[ 4] ) );
|
||||||
|
casti_m512i( d, 5 ) = _mm512_set1_epi32( bswap_32( s[ 5] ) );
|
||||||
|
casti_m512i( d, 6 ) = _mm512_set1_epi32( bswap_32( s[ 6] ) );
|
||||||
|
casti_m512i( d, 7 ) = _mm512_set1_epi32( bswap_32( s[ 7] ) );
|
||||||
|
casti_m512i( d, 8 ) = _mm512_set1_epi32( bswap_32( s[ 8] ) );
|
||||||
|
casti_m512i( d, 9 ) = _mm512_set1_epi32( bswap_32( s[ 9] ) );
|
||||||
|
casti_m512i( d,10 ) = _mm512_set1_epi32( bswap_32( s[10] ) );
|
||||||
|
casti_m512i( d,11 ) = _mm512_set1_epi32( bswap_32( s[11] ) );
|
||||||
|
casti_m512i( d,12 ) = _mm512_set1_epi32( bswap_32( s[12] ) );
|
||||||
|
casti_m512i( d,13 ) = _mm512_set1_epi32( bswap_32( s[13] ) );
|
||||||
|
casti_m512i( d,14 ) = _mm512_set1_epi32( bswap_32( s[14] ) );
|
||||||
|
casti_m512i( d,15 ) = _mm512_set1_epi32( bswap_32( s[15] ) );
|
||||||
|
casti_m512i( d,16 ) = _mm512_set1_epi32( bswap_32( s[16] ) );
|
||||||
|
casti_m512i( d,17 ) = _mm512_set1_epi32( bswap_32( s[17] ) );
|
||||||
|
casti_m512i( d,18 ) = _mm512_set1_epi32( bswap_32( s[18] ) );
|
||||||
|
casti_m512i( d,19 ) = _mm512_set1_epi32( bswap_32( s[19] ) );
|
||||||
|
}
|
||||||
|
/*
|
||||||
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||||
{
|
{
|
||||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
@@ -1818,6 +1896,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
|||||||
casti_m128i( d,78 ) =
|
casti_m128i( d,78 ) =
|
||||||
casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff );
|
casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff );
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
|
|
||||||
@@ -2470,6 +2549,25 @@ static inline void extr_lane_8x64( void *d, const void *s,
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||||
|
|
||||||
|
// broadcast to all lanes
|
||||||
|
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
|
||||||
|
{
|
||||||
|
__m128i *d = (__m128i*)dst;
|
||||||
|
const __m128i *s = (const __m128i*)src;
|
||||||
|
|
||||||
|
d[ 0] = d[ 1] = d[ 2] = d[ 3] = _mm_shuffle_epi32( s[0], 0x44 );
|
||||||
|
d[ 4] = d[ 5] = d[ 6] = d[ 7] = _mm_shuffle_epi32( s[0], 0xee );
|
||||||
|
d[ 8] = d[ 9] = d[10] = d[11] = _mm_shuffle_epi32( s[1], 0x44 );
|
||||||
|
d[12] = d[13] = d[14] = d[15] = _mm_shuffle_epi32( s[1], 0xee );
|
||||||
|
d[16] = d[17] = d[18] = d[19] = _mm_shuffle_epi32( s[2], 0x44 );
|
||||||
|
d[20] = d[21] = d[22] = d[23] = _mm_shuffle_epi32( s[2], 0xee );
|
||||||
|
d[24] = d[25] = d[26] = d[27] = _mm_shuffle_epi32( s[3], 0x44 );
|
||||||
|
d[28] = d[29] = d[30] = d[31] = _mm_shuffle_epi32( s[3], 0xee );
|
||||||
|
d[32] = d[33] = d[34] = d[35] = _mm_shuffle_epi32( s[4], 0x44 );
|
||||||
|
d[36] = d[37] = d[38] = d[39] = _mm_shuffle_epi32( s[4], 0xee );
|
||||||
|
}
|
||||||
|
|
||||||
|
// byte swap and broadcast to al lanes
|
||||||
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||||
{
|
{
|
||||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
@@ -2556,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
|
|||||||
d[10] = s0[5]; d[11] = s1[5];
|
d[10] = s0[5]; d[11] = s1[5];
|
||||||
d[12] = s0[6]; d[13] = s1[6];
|
d[12] = s0[6]; d[13] = s1[6];
|
||||||
d[14] = s0[7]; d[15] = s1[7];
|
d[14] = s0[7]; d[15] = s1[7];
|
||||||
|
if ( bit_len <= 1024 ) return;
|
||||||
|
d[16] = s0[8]; d[17] = s1[8];
|
||||||
|
d[18] = s0[9]; d[19] = s1[9];
|
||||||
|
// if ( bit_len <= 1280 ) return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void intrlv_2x128_512( void *dst, const void *src0,
|
static inline void intrlv_2x128_512( void *dst, const void *src0,
|
||||||
@@ -2623,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
|
|||||||
d[20] = s0[5]; d[21] = s1[5]; d[22] = s2[5]; d[23] = s3[5];
|
d[20] = s0[5]; d[21] = s1[5]; d[22] = s2[5]; d[23] = s3[5];
|
||||||
d[24] = s0[6]; d[25] = s1[6]; d[26] = s2[6]; d[27] = s3[6];
|
d[24] = s0[6]; d[25] = s1[6]; d[26] = s2[6]; d[27] = s3[6];
|
||||||
d[28] = s0[7]; d[29] = s1[7]; d[30] = s2[7]; d[31] = s3[7];
|
d[28] = s0[7]; d[29] = s1[7]; d[30] = s2[7]; d[31] = s3[7];
|
||||||
|
if ( bit_len <= 1024 ) return;
|
||||||
|
d[32] = s0[8]; d[33] = s1[8]; d[34] = s2[8]; d[35] = s3[8];
|
||||||
|
d[36] = s0[9]; d[37] = s1[9]; d[38] = s2[9]; d[39] = s3[9];
|
||||||
|
// if ( bit_len <= 1280 ) return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void intrlv_4x128_512( void *dst, const void *src0,
|
static inline void intrlv_4x128_512( void *dst, const void *src0,
|
||||||
|
@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
#define mm128_rol_16( v, c ) \
|
#define mm128_rol_16( v, c ) \
|
||||||
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
|
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
|
||||||
|
|
||||||
// Limited 2 input shuffle
|
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||||
|
// half is always taken from src a, and the high half from src b.
|
||||||
#define mm128_shuffle2_64( a, b, c ) \
|
#define mm128_shuffle2_64( a, b, c ) \
|
||||||
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
|
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
|
||||||
_mm_castsi128_pd( b ), c ) );
|
_mm_castsi128_pd( b ), c ) );
|
||||||
|
@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
|||||||
#define mm256_shuflr64_32 mm256_swap64_32
|
#define mm256_shuflr64_32 mm256_swap64_32
|
||||||
#define mm256_shufll64_32 mm256_swap64_32
|
#define mm256_shufll64_32 mm256_swap64_32
|
||||||
|
|
||||||
//
|
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
|
||||||
// Swap bytes in vector elements, endian bswap.
|
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
|
||||||
|
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
|
||||||
|
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
|
||||||
|
// AVX2 version will work here. The bswap control vector is coded to work
|
||||||
|
// with both versions, bit 4 is ignored in AVX2.
|
||||||
|
|
||||||
|
// Reverse byte order in elements, endian bswap.
|
||||||
#define mm256_bswap_64( v ) \
|
#define mm256_bswap_64( v ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_shuffle_epi8( v, \
|
||||||
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||||
|
@@ -15,13 +15,14 @@
|
|||||||
|
|
||||||
// AVX512 intrinsics have a few changes from previous conventions.
|
// AVX512 intrinsics have a few changes from previous conventions.
|
||||||
//
|
//
|
||||||
// cmp instruction now returns a bitmask isnstead of a vector mask.
|
// cmp instruction now returns a bitmask instead of a vector mask.
|
||||||
// This eliminates the need for the blendv instruction.
|
// This eliminates the need for the blendv instruction.
|
||||||
//
|
//
|
||||||
// The new rotate instructions require the count to be an 8 bit
|
// The new rotate instructions require the count to be an 8 bit
|
||||||
// immediate value only. Compilation fails if a variable is used.
|
// immediate value only. Compilation fails if a variable is used.
|
||||||
// The documentation is the same as for shift and it works with
|
// The documentation is the same as for shift and it works with
|
||||||
// variables.
|
// variables. The inconsistency is likely due to compiler optimizations
|
||||||
|
// that can eliminate the variable in some instances.
|
||||||
//
|
//
|
||||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||||
// usually shuffles accross all lanes.
|
// usually shuffles accross all lanes.
|
||||||
@@ -317,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
|
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
|
||||||
// elements and can be called directly. But they only accept immediate 8
|
// elements and can be called directly. But they only accept immediate 8
|
||||||
// for control arg.
|
// for control arg.
|
||||||
|
// The workaround is a fraud, just a fluke of the compiler's optimizer.
|
||||||
|
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
|
||||||
|
// the variable control, better than rotate loops.
|
||||||
//
|
//
|
||||||
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
|
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
|
||||||
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
|
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
|
||||||
@@ -429,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
|
|||||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
//
|
|
||||||
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
|
|
||||||
//
|
|
||||||
|
|
||||||
// rename plan change ror to vror for Vector ROtate Right,
|
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
|
||||||
// and vrol for Vector ROtate Left, not to be confused with
|
//
|
||||||
//variable rotate rorv, rolv,
|
|
||||||
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
|
|
||||||
// operation. 1xNN notaion ia also removed and replaced with simpler NN.
|
|
||||||
// Swap will still have its own mnemonic and will be aliased as both
|
|
||||||
// left and right shuffles.
|
|
||||||
|
|
||||||
// Shift elements right or left in 512 bit vector, filling with zeros.
|
|
||||||
// Multiple element shifts can be combined into a single larger
|
|
||||||
// element shift.
|
|
||||||
|
|
||||||
#define mm512_shiftr_256( v ) \
|
#define mm512_shiftr_256( v ) \
|
||||||
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
|
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
|
||||||
@@ -529,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
|||||||
// 128 bit lane shift is handled by bslli bsrli.
|
// 128 bit lane shift is handled by bslli bsrli.
|
||||||
|
|
||||||
// Swap hi & lo 128 bits in each 256 bit lane
|
// Swap hi & lo 128 bits in each 256 bit lane
|
||||||
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||||
#define mm512_shuflr256_128 mm512_swap256_128
|
#define mm512_shuflr256_128 mm512_swap256_128
|
||||||
#define mm512_shufll256_128 mm512_swap256_128
|
#define mm512_shufll256_128 mm512_swap256_128
|
||||||
|
|
||||||
@@ -583,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
|||||||
//
|
//
|
||||||
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
|
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
|
||||||
|
|
||||||
// Limited 2 input, 1 output shuffle within 128 bit lanes.
|
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
|
||||||
|
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
|
||||||
|
// destination elements must come from a specific source.
|
||||||
#define mm512_shuffle2_64( a, b, c ) \
|
#define mm512_shuffle2_64( a, b, c ) \
|
||||||
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
|
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
|
||||||
_mm512_castsi512_pd( b ), c ) );
|
_mm512_castsi512_pd( b ), c ) );
|
||||||
@@ -620,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
|||||||
// Drop macros? They can easilly be rebuilt using shufl2 functions
|
// Drop macros? They can easilly be rebuilt using shufl2 functions
|
||||||
|
|
||||||
// 2 input, 1 output
|
// 2 input, 1 output
|
||||||
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
|
// Rotate concatenated { v1, v2 ) right or left and return v1.
|
||||||
// rotated v1
|
|
||||||
// visually confusing for shif2r because of arg order. First arg is always
|
|
||||||
// the target for modification, either update by reference or by function
|
|
||||||
// return.
|
|
||||||
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
|
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
|
||||||
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
|
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
|
||||||
|
|
||||||
|
22
sysinfos.c
22
sysinfos.c
@@ -502,6 +502,28 @@ static inline bool has_vaes()
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool has_vbmi()
|
||||||
|
{
|
||||||
|
#ifdef __arm__
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
int cpu_info[4] = { 0 };
|
||||||
|
cpuid( EXTENDED_FEATURES, cpu_info );
|
||||||
|
return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool has_vbmi2()
|
||||||
|
{
|
||||||
|
#ifdef __arm__
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
int cpu_info[4] = { 0 };
|
||||||
|
cpuid( EXTENDED_FEATURES, cpu_info );
|
||||||
|
return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
// AMD only
|
// AMD only
|
||||||
static inline bool has_xop()
|
static inline bool has_xop()
|
||||||
{
|
{
|
||||||
|
17
util.c
17
util.c
@@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
|
|||||||
free(sctx->url);
|
free(sctx->url);
|
||||||
sctx->url = strdup(url);
|
sctx->url = strdup(url);
|
||||||
}
|
}
|
||||||
free(sctx->curl_url);
|
|
||||||
|
free(sctx->curl_url);
|
||||||
sctx->curl_url = (char*) malloc(strlen(url));
|
sctx->curl_url = (char*) malloc(strlen(url));
|
||||||
sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
|
|
||||||
? strstr( url, "s://" )
|
// replace the stratum protocol prefix with http, https for ssl
|
||||||
: strstr (url, "://" ) );
|
sprintf( sctx->curl_url, "%s%s",
|
||||||
|
( strstr( url, "s://" ) || strstr( url, "ssl://" ) )
|
||||||
|
? "https" : "http", strstr( url, "://" ) );
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
|
||||||
|
// ? strstr( url, "s://" )
|
||||||
|
// : strstr (url, "://" ) );
|
||||||
|
|
||||||
if (opt_protocol)
|
if (opt_protocol)
|
||||||
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
|
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
|
||||||
|
Reference in New Issue
Block a user