mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.4
This commit is contained in:
@@ -158,7 +158,9 @@ cpuminer_SOURCES = \
|
||||
algo/ripemd/lbry.c \
|
||||
algo/ripemd/lbry-4way.c \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/scrypt-core-4way.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/sha/sha256-hash.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
@@ -167,6 +169,7 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
|
@@ -65,6 +65,37 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.18.0
|
||||
|
||||
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- AVX512 & SHA support for SHA256; AVX512 has priority,
|
||||
- up to 50% increase in hashrate,
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
- scrypt, default N=1024 (LTC), will likely perform slower.
|
||||
|
||||
Improved stale share detection and handling for Scrypt with large N factor:
|
||||
- abort and discard partially computed hash when new work is detected,
|
||||
- quicker response to new job, less time wasted mining stale job.
|
||||
|
||||
Improved stale share handling for all algorithms:
|
||||
- report possible stale share when new work received with a previously
|
||||
submitted share still pending,
|
||||
- when new work is detected report the submission of an already completed,
|
||||
otherwise valid, but likely stale, share,
|
||||
- fixed incorrect block height in stale share log.
|
||||
|
||||
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
|
||||
|
||||
When stratum disconnects, miner threads go idle until reconnected.
|
||||
|
||||
Colour changes to some logs.
|
||||
|
||||
Some low level function name changes for clarity and consistency.
|
||||
|
||||
The reference hashrate in the summary log and the benchmark total hashrate
|
||||
are now the mean hashrate for the session.
|
||||
|
||||
v3.17.1
|
||||
|
||||
Fixed Windows build for AES+SSE4.2 (Westmere), which was missing AES.
|
||||
|
@@ -1,3 +1,6 @@
|
||||
#ifndef __ALGO_GATE_API_H__
|
||||
#define __ALGO_GATE_API_H__ 1
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
@@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
#endif
|
||||
|
@@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define ror64(x, n) _mm512_ror_epi64((x), (n))
|
||||
// Rotate each 64-bit lane of the 512-bit vector x right by n bits.
// _mm512_ror_epi64 takes the count as an immediate, so n must be a
// compile-time constant (the users below pass 32, 24, 16 and 63).
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
|
||||
|
||||
static __m512i muladd(__m512i x, __m512i y)
|
||||
{
|
||||
@@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 32); \
|
||||
D1 = ror64(D1, 32); \
|
||||
D0 = ROR64(D0, 32); \
|
||||
D1 = ROR64(D1, 32); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 24); \
|
||||
B1 = ror64(B1, 24); \
|
||||
B0 = ROR64(B0, 24); \
|
||||
B1 = ROR64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 16); \
|
||||
D1 = ror64(D1, 16); \
|
||||
D0 = ROR64(D0, 16); \
|
||||
D1 = ROR64(D1, 16); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 63); \
|
||||
B1 = ror64(B1, 63); \
|
||||
B0 = ROR64(B0, 63); \
|
||||
B1 = ROR64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
|
@@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
#define rb6(x) mm256_rol_64( x, 43 )
|
||||
#define rb7(x) mm256_rol_64( x, 53 )
|
||||
|
||||
#define rol_off_64( M, j, off ) \
|
||||
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
// Pick message word M[ j & 0xF ] and rotate it by ( j & 0xF ) + 1 bit
// positions. The word index and the rotate count both come from the same
// masked j, so the count is always in 1..16.
// NOTE(review): mm256_rol_64 is defined elsewhere; presumably a rotate-left
// of every 64-bit lane of a __m256i -- confirm.
#define rol_off_64( M, j ) \
|
||||
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b( M, H, j ) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
|
||||
rol_off_64( M, j, 3 ) ), \
|
||||
rol_off_64( M, j, 10 ) ), \
|
||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
// Per 64-bit lane: h ^ ( K + ( ( mj0 + mj3 ) - mj10 ) ).
// mj0, mj3 and mj10 are three already rotated message words (see
// rol_off_64), h is an element of H and K is a broadcast constant.
// Each argument is expanded exactly once and only as a function argument.
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
|
||||
#define expand1b( qt, M, H, i ) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand1_b( qt, i ) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
|
||||
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
|
||||
@@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
|
||||
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b( qt, M, H, i) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand2_b( qt, i) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
|
||||
@@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm256_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm256_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm256_sub_epi64( mh[10], \
|
||||
mh[13] ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm256_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm256_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm256_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm256_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm256_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
__m256i qt[32], xl, xh;
|
||||
__m256i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm256_xor_si256( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
|
||||
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
|
||||
@@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
|
||||
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
|
||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||
qt[16] = expand1b( qt, M, H, 16 );
|
||||
qt[17] = expand1b( qt, M, H, 17 );
|
||||
qt[18] = expand2b( qt, M, H, 18 );
|
||||
qt[19] = expand2b( qt, M, H, 19 );
|
||||
qt[20] = expand2b( qt, M, H, 20 );
|
||||
qt[21] = expand2b( qt, M, H, 21 );
|
||||
qt[22] = expand2b( qt, M, H, 22 );
|
||||
qt[23] = expand2b( qt, M, H, 23 );
|
||||
qt[24] = expand2b( qt, M, H, 24 );
|
||||
qt[25] = expand2b( qt, M, H, 25 );
|
||||
qt[26] = expand2b( qt, M, H, 26 );
|
||||
qt[27] = expand2b( qt, M, H, 27 );
|
||||
qt[28] = expand2b( qt, M, H, 28 );
|
||||
qt[29] = expand2b( qt, M, H, 29 );
|
||||
qt[30] = expand2b( qt, M, H, 30 );
|
||||
qt[31] = expand2b( qt, M, H, 31 );
|
||||
|
||||
__m256i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol_off_64( M, i );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
|
||||
|
||||
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
|
||||
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
|
||||
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
|
||||
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
|
||||
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
|
||||
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
|
||||
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
|
||||
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
|
||||
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
|
||||
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
|
||||
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
|
||||
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
|
||||
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
|
||||
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
@@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
@@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||
|
||||
#define rol8w_off_64( M, j, off ) \
|
||||
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
// 512-bit counterpart of rol_off_64: pick message word M[ j & 0xF ] and
// rotate it by ( j & 0xF ) + 1 bit positions (count always in 1..16).
// NOTE(review): mm512_rol_64 is defined elsewhere; presumably a rotate-left
// of every 64-bit lane of a __m512i -- confirm.
#define rol8w_off_64( M, j ) \
|
||||
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b8( M, H, j ) \
|
||||
_mm512_xor_si512( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
|
||||
rol8w_off_64( M, j, 3 ) ), \
|
||||
rol8w_off_64( M, j, 10 ) ), \
|
||||
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
// Per 64-bit lane of a 512-bit vector: h ^ ( K + ( ( mj0 + mj3 ) - mj10 ) ).
// mj0, mj3 and mj10 are three already rotated message words (see
// rol8w_off_64), h is an element of H and K is a broadcast constant.
// Each argument is expanded exactly once and only as a function argument.
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define expand1b8( qt, M, H, i ) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand1_b8( qt, i ) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
||||
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
||||
@@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
||||
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b8( qt, M, H, i) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand2_b8( qt, i) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
||||
@@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define W8b0 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm512_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define W8b1 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm512_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define W8b2 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b3 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm512_sub_epi64( mh[10], mh[13] ) )
|
||||
|
||||
#define W8b4 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm512_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define W8b5 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b6 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm512_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define W8b7 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm512_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define W8b8 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define W8b9 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define W8b10 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define W8b11 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define W8b12 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define W8b13 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm512_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define W8b14 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm512_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define W8b15 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
__m512i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm512_xor_si512( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
||||
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
||||
@@ -1268,22 +1169,60 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
qt[16] = expand1b8( qt, M, H, 16 );
|
||||
qt[17] = expand1b8( qt, M, H, 17 );
|
||||
qt[18] = expand2b8( qt, M, H, 18 );
|
||||
qt[19] = expand2b8( qt, M, H, 19 );
|
||||
qt[20] = expand2b8( qt, M, H, 20 );
|
||||
qt[21] = expand2b8( qt, M, H, 21 );
|
||||
qt[22] = expand2b8( qt, M, H, 22 );
|
||||
qt[23] = expand2b8( qt, M, H, 23 );
|
||||
qt[24] = expand2b8( qt, M, H, 24 );
|
||||
qt[25] = expand2b8( qt, M, H, 25 );
|
||||
qt[26] = expand2b8( qt, M, H, 26 );
|
||||
qt[27] = expand2b8( qt, M, H, 27 );
|
||||
qt[28] = expand2b8( qt, M, H, 28 );
|
||||
qt[29] = expand2b8( qt, M, H, 29 );
|
||||
qt[30] = expand2b8( qt, M, H, 30 );
|
||||
qt[31] = expand2b8( qt, M, H, 31 );
|
||||
|
||||
__m512i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol8w_off_64( M, i );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
|
||||
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
|
||||
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
|
||||
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
|
||||
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
|
||||
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
|
||||
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
|
||||
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
|
||||
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
|
||||
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
|
||||
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
|
||||
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
|
||||
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
|
||||
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
|
||||
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
|
@@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp )
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
// 8 ways, 4 way parallel double buffered
|
||||
// Runs sp->rounds iterations of the round function on two independent
// states at once: x0..x7 are loaded from sp->h0 and y0..y7 from sp->h1.
// Every x statement has a matching y statement; the two streams never
// exchange data, they are only interleaved in program order.
// Both states are written back to sp->h0 / sp->h1 before returning.
static void transform_4way_2buf( cube_4way_2buf_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;   // loop bound read once from the context
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
__m512i tx0, tx1, ty0, ty1;   // scratch copies used while the lower half is permuted
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h0 );   // load state 0 (8 vectors)
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
|
||||
|
||||
y0 = _mm512_load_si512( (__m512i*)sp->h1 );   // load state 1 (8 vectors)
|
||||
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
|
||||
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
|
||||
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
|
||||
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
|
||||
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
|
||||
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
|
||||
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
|
||||
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );   // first pass: upper half (4..7) += lower half (0..3)
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
tx0 = x0;   // keep old x0: x0 is overwritten below but still feeds x2
|
||||
ty0 = y0;
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
tx1 = x1;   // keep old x1: it still feeds x3
|
||||
ty1 = y1;
|
||||
x0 = mm512_rol_32( x2, 7 );   // new x0 comes from old x2, i.e. 0<->2 and 1<->3 trade places; NOTE(review): mm512_rol_32 is presumably a per-32-bit rotate left - confirm
|
||||
y0 = mm512_rol_32( y2, 7 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
y1 = mm512_rol_32( y3, 7 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
|
||||
|
||||
x2 = mm512_rol_32( tx0, 7 );
|
||||
y2 = mm512_rol_32( ty0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );   // xor the summed upper half into the permuted lower half
|
||||
y0 = _mm512_xor_si512( y0, y4 );
|
||||
x4 = mm512_swap128_64( x4 );   // NOTE(review): presumably swaps the 64-bit halves of each 128-bit lane - confirm against the helper
|
||||
x3 = mm512_rol_32( tx1, 7 );
|
||||
y3 = mm512_rol_32( ty1, 7 );
|
||||
y4 = mm512_swap128_64( y4 );
|
||||
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
y1 = _mm512_xor_si512( y1, y5 );
|
||||
x5 = mm512_swap128_64( x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
y2 = _mm512_xor_si512( y2, y6 );
|
||||
y5 = mm512_swap128_64( y5 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
|
||||
x6 = mm512_swap128_64( x6 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );   // second pass: upper half += lower half again
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
y6 = mm512_swap128_64( y6 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
x7 = mm512_swap128_64( x7 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
tx0 = x0;
|
||||
ty0 = y0;
|
||||
y7 = mm512_swap128_64( y7 );
|
||||
tx1 = x2;   // saves x2 (not x1): this pass pairs 0<->1 and 2<->3
|
||||
ty1 = y2;
|
||||
x0 = mm512_rol_32( x1, 11 );   // rotate amount is 11 in the second pass
|
||||
y0 = mm512_rol_32( y1, 11 );
|
||||
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
|
||||
x1 = mm512_rol_32( tx0, 11 );
|
||||
y1 = mm512_rol_32( ty0, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x4 = mm512_swap64_32( x4 );   // NOTE(review): presumably swaps the 32-bit halves of each 64-bit element - confirm against the helper
|
||||
y0 = _mm512_xor_si512( y0, y4 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
y4 = mm512_swap64_32( y4 );
|
||||
y2 = mm512_rol_32( y3, 11 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
y1 = _mm512_xor_si512( y1, y5 );
|
||||
x3 = mm512_rol_32( tx1, 11 );
|
||||
y5 = mm512_swap64_32( y5 );
|
||||
y3 = mm512_rol_32( ty1, 11 );
|
||||
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x6 = mm512_swap64_32( x6 );
|
||||
y2 = _mm512_xor_si512( y2, y6 );
|
||||
y6 = mm512_swap64_32( y6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x7 = mm512_swap64_32( x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
|
||||
y7 = mm512_swap64_32( y7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h0, x0 );   // write state 0 back
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h1, y0 );   // write state 1 back
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
@@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// One-shot hash of two independent input buffers using the double
// buffered transform. The context is fully re-initialised on every call
// (hash length from hashbitlen, 16 rounds, block size 32/16 vectors), the
// input is absorbed one 512-bit vector at a time, then padding and the
// finalisation rounds are applied to both states together.
//
//  sp               context, overwritten
//  output0/output1  receive sp->hashlen << 6 bytes each
//  hashbitlen       512 selects IV512, anything else selects IV256
//  data0/data1      inputs, read as arrays of __m512i
//  size             input length; size >> 4 vectors are read from each buffer
//
// Always returns 0.
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
                         void *output0, void *output1, int hashbitlen,
                         const void *data0, const void *data1, size_t size )
{
   __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
                                               : (__m128i*)IV256 );
   const __m512i *src0 = (__m512i*)data0;
   const __m512i *src1 = (__m512i*)data1;
   const int nvec = size >> 4;
   __m512i pad;
   int k;

   sp->hashlen   = hashbitlen/128;
   sp->blocksize = 32/16;
   sp->rounds    = 16;
   sp->pos       = 0;

   // Both states start from the same IV, broadcast to every 128-bit lane.
   for ( k = 0; k < 8; k++ )
   {
      const __m512i v = m512_const1_128( iv[k] );
      sp->h0[k] = v;
      sp->h1[k] = v;
   }

   // Absorb: xor each input vector into the current block position and
   // run the transform whenever a block has been filled.
   for ( k = 0; k < nvec; k++ )
   {
      const int p = sp->pos;

      sp->h0[p] = _mm512_xor_si512( sp->h0[p], src0[k] );
      sp->h1[p] = _mm512_xor_si512( sp->h1[p], src1[k] );

      if ( ++sp->pos == sp->blocksize )
      {
         transform_4way_2buf( sp );
         sp->pos = 0;
      }
   }

   // pos is zero for 64 byte data, 1 for 80 byte data.
   pad = m512_const2_64( 0, 0x0000000000000080 );
   sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], pad );
   sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], pad );

   transform_4way_2buf( sp );

   // Flip the finalisation bit in the last state vector, then run the
   // ten closing transforms.
   pad = m512_const2_64( 0x0000000100000000, 0 );
   sp->h0[7] = _mm512_xor_si512( sp->h0[7], pad );
   sp->h1[7] = _mm512_xor_si512( sp->h1[7], pad );

   for ( k = 0; k < 10; ++k )
      transform_4way_2buf( sp );

   memcpy( output0, sp->h0, sp->hashlen<<6 );
   memcpy( output1, sp->h1, sp->hashlen<<6 );

   return 0;
}
|
||||
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
@@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
// 2 way 128
|
||||
|
||||
// This isn't expected to be used with AVX512 so HW rotate instruction
|
||||
// is assumed not available.
|
||||
// Use double buffering to optimize serial bit rotations. Full double
|
||||
// buffering isn't practical because it needs twice as many registers
|
||||
// with AVX2 having only half as many as AVX512.
|
||||
// Rotate every 32-bit element of in0 and in1 left by c bits and store the
// results in out0 and out1. The two rotations are written side by side so
// the shift/or chains of the two vectors can overlap.
//
// All four shifted values are computed before either output is assigned,
// so any output may name the same variable as any input (e.g.
// ROL2( x0, x1, x1, y0, 11 )). Each argument is still expanded more than
// once, so arguments must be free of side effects. c must be in 1..31.
#define ROL2( out0, out1, in0, in1, c ) \
do { \
   const __m256i rol2_lo0_ = _mm256_slli_epi32( (in0), (c) ); \
   const __m256i rol2_lo1_ = _mm256_slli_epi32( (in1), (c) ); \
   const __m256i rol2_hi0_ = _mm256_srli_epi32( (in0), 32-(c) ); \
   const __m256i rol2_hi1_ = _mm256_srli_epi32( (in1), 32-(c) ); \
   (out0) = _mm256_or_si256( rol2_hi0_, rol2_lo0_ ); \
   (out1) = _mm256_or_si256( rol2_hi1_, rol2_lo1_ ); \
} while (0)
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
ROL2( x0, x1, x2, x3, 7 );
|
||||
ROL2( x2, x3, y0, y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap128_64( x4 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
y0 = x0;
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
y1 = x2;
|
||||
ROL2( x0, x1, x1, y0, 11 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
ROL2( x2, x3, x3, y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x4 = mm256_swap64_32( x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x5 = mm256_swap64_32( x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x6 = mm256_swap64_32( x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x7 = mm256_swap64_32( x7 );
|
||||
}
|
||||
|
||||
|
@@ -17,41 +17,41 @@ struct _cube_4way_context
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
// Context for the double buffered 4-way transform: two complete hash
// states that are always advanced together.
struct _cube_4way_2buf_context
|
||||
{
|
||||
__m512i h0[8];   // state of the first buffer, 8 vectors
|
||||
__m512i h1[8];   // state of the second buffer, same layout as h0
|
||||
int hashlen;   // set to hashbitlen/128; hashlen<<6 bytes are copied out per state
|
||||
int rounds;   // number of loop iterations performed by one transform call
|
||||
int blocksize;   // number of vectors absorbed before the transform is run
|
||||
int pos;   // index of the next vector to absorb into, 0 <= pos < blocksize
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
int blockbytes );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
// Fixed-length convenience wrappers around the generic 4-way API.
// Each wrapper forwards to the function of the same name; the init
// wrappers supply the hash length together with 16 rounds and a 32 byte
// block, the same parameters used elsewhere for this hash.
#define cube512_4way_init( sp )   cube_4way_init( sp, 512, 16, 32 )
#define cube512_4way_update       cube_4way_update
#define cube512_4way_update_close cube_4way_update_close
#define cube512_4way_close        cube_4way_close
#define cube512_4way_full( sp, output, data, size ) \
   cube_4way_full( sp, output, 512, data, size )
#define cube512_4x256_full( sp, output, data, size ) \
   cube_4x256_full( sp, output, 512, data, size )

#define cube256_4way_init( sp )   cube_4way_init( sp, 256, 16, 32 )
#define cube256_4way_update       cube_4way_update
#define cube256_4way_update_close cube_4way_update_close
#define cube256_4way_close        cube_4way_close
#define cube256_4way_full( sp, output, data, size ) \
   cube_4way_full( sp, output, 256, data, size )
#define cube256_4x256_full( sp, output, data, size ) \
   cube_4x256_full( sp, output, 256, data, size )
|
||||
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||
void *output0, void *output1, int hashbitlen,
|
||||
const void *data0, const void *data1, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
// 2x128, 2 way parallel AVX2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
|
@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
|
||||
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32(
|
||||
mm512_swap256_128( x0 ), 11 ), x1 );
|
||||
x0 = mm512_swap_256( x0 );
|
||||
x0 = mm512_rol_32( x0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap128_64( x1 );
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = mm512_swap256_128( x0 );
|
||||
x0 = mm512_rol_32( x0, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap64_32( x1 );
|
||||
}
|
||||
|
||||
|
@@ -43,7 +43,8 @@
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
|
||||
//#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -11,7 +11,7 @@
|
||||
#else
|
||||
#include "sph_groestl.h"
|
||||
#endif
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
typedef struct {
|
||||
#ifdef __AES__
|
||||
@@ -19,7 +19,6 @@ typedef struct {
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
sph_sha256_context sha;
|
||||
} myrgr_ctx_holder;
|
||||
|
||||
myrgr_ctx_holder myrgr_ctx;
|
||||
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
|
||||
#else
|
||||
sph_groestl512_init( &myrgr_ctx.groestl );
|
||||
#endif
|
||||
sph_sha256_init( &myrgr_ctx.sha );
|
||||
}
|
||||
|
||||
void myriad_hash(void *output, const void *input)
|
||||
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
|
||||
sph_sha256( &ctx.sha, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
@@ -632,26 +632,25 @@ do { \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define ROUND_BIG8(rc, alpha) \
|
||||
#define ROUND_BIG8( alpha ) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
s0 = _mm512_xor_si512( s0, m512_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
|
||||
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
|
||||
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
|
||||
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
|
||||
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
|
||||
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
|
||||
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
|
||||
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
|
||||
sA = _mm512_xor_si512( sA, alpha[10] ); \
|
||||
sB = _mm512_xor_si512( sB, alpha[11] ); \
|
||||
sC = _mm512_xor_si512( sC, alpha[12] ); \
|
||||
sD = _mm512_xor_si512( sD, alpha[13] ); \
|
||||
sE = _mm512_xor_si512( sE, alpha[14] ); \
|
||||
sF = _mm512_xor_si512( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX8( s0, s4, s8, sC ); \
|
||||
SBOX8( s1, s5, s9, sD ); \
|
||||
@@ -731,28 +730,66 @@ do { \
|
||||
|
||||
#define P_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8(0, alpha_n); \
|
||||
ROUND_BIG8(1, alpha_n); \
|
||||
ROUND_BIG8(2, alpha_n); \
|
||||
ROUND_BIG8(3, alpha_n); \
|
||||
ROUND_BIG8(4, alpha_n); \
|
||||
ROUND_BIG8(5, alpha_n); \
|
||||
__m512i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8( 0, alpha_f); \
|
||||
ROUND_BIG8( 1, alpha_f); \
|
||||
ROUND_BIG8( 2, alpha_f); \
|
||||
ROUND_BIG8( 3, alpha_f); \
|
||||
ROUND_BIG8( 4, alpha_f); \
|
||||
ROUND_BIG8( 5, alpha_f); \
|
||||
ROUND_BIG8( 6, alpha_f); \
|
||||
ROUND_BIG8( 7, alpha_f); \
|
||||
ROUND_BIG8( 8, alpha_f); \
|
||||
ROUND_BIG8( 9, alpha_f); \
|
||||
ROUND_BIG8(10, alpha_f); \
|
||||
ROUND_BIG8(11, alpha_f); \
|
||||
__m512i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG8 \
|
||||
@@ -965,26 +1002,25 @@ do { \
|
||||
#define sF m7
|
||||
*/
|
||||
|
||||
#define ROUND_BIG(rc, alpha) \
|
||||
#define ROUND_BIG( alpha ) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
|
||||
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
|
||||
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
|
||||
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
|
||||
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
|
||||
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
|
||||
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
|
||||
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
|
||||
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
|
||||
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
|
||||
sA = _mm256_xor_si256( sA, alpha[10] ); \
|
||||
sB = _mm256_xor_si256( sB, alpha[11] ); \
|
||||
sC = _mm256_xor_si256( sC, alpha[12] ); \
|
||||
sD = _mm256_xor_si256( sD, alpha[13] ); \
|
||||
sE = _mm256_xor_si256( sE, alpha[14] ); \
|
||||
sF = _mm256_xor_si256( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX( s0, s4, s8, sC ); \
|
||||
SBOX( s1, s5, s9, sD ); \
|
||||
@@ -1064,28 +1100,66 @@ do { \
|
||||
|
||||
#define P_BIG \
|
||||
do { \
|
||||
ROUND_BIG(0, alpha_n); \
|
||||
ROUND_BIG(1, alpha_n); \
|
||||
ROUND_BIG(2, alpha_n); \
|
||||
ROUND_BIG(3, alpha_n); \
|
||||
ROUND_BIG(4, alpha_n); \
|
||||
ROUND_BIG(5, alpha_n); \
|
||||
__m256i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG \
|
||||
do { \
|
||||
ROUND_BIG( 0, alpha_f); \
|
||||
ROUND_BIG( 1, alpha_f); \
|
||||
ROUND_BIG( 2, alpha_f); \
|
||||
ROUND_BIG( 3, alpha_f); \
|
||||
ROUND_BIG( 4, alpha_f); \
|
||||
ROUND_BIG( 5, alpha_f); \
|
||||
ROUND_BIG( 6, alpha_f); \
|
||||
ROUND_BIG( 7, alpha_f); \
|
||||
ROUND_BIG( 8, alpha_f); \
|
||||
ROUND_BIG( 9, alpha_f); \
|
||||
ROUND_BIG(10, alpha_f); \
|
||||
ROUND_BIG(11, alpha_f); \
|
||||
__m256i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG \
|
||||
|
@@ -7,6 +7,7 @@
|
||||
#include "hodl-gate.h"
|
||||
#include "hodl-wolf.h"
|
||||
#include "miner.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
|
@@ -1,5 +1,6 @@
|
||||
#include "keccak-gate.h"
|
||||
#include "sph_keccak.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
int hard_coded_eb = 1;
|
||||
|
||||
|
@@ -70,13 +70,13 @@ static const uint64_t RC[] = {
|
||||
|
||||
// Targeted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
|
||||
|
||||
|
@@ -16,7 +16,7 @@
|
||||
typedef struct {
|
||||
blake256_16way_context blake;
|
||||
keccak256_8way_context keccak;
|
||||
cube_4way_context cube;
|
||||
cube_4way_2buf_context cube;
|
||||
skein256_8way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_context groestl;
|
||||
@@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx;
|
||||
bool init_allium_16way_ctx()
|
||||
{
|
||||
keccak256_8way_init( &allium_16way_ctx.keccak );
|
||||
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
|
||||
skein256_8way_init( &allium_16way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_16way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||
|
||||
|
||||
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
@@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
|
||||
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
|
||||
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
|
||||
@@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
typedef struct {
|
||||
blake256_8way_context blake;
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
cube_2way_context cube;
|
||||
skein256_4way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_context groestl;
|
||||
@@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
bool init_allium_8way_ctx()
|
||||
{
|
||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
||||
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_8way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
|
||||
|
||||
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash0, hash1, vhashA, 256 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 256 );
|
||||
|
||||
intrlv_2x128( vhashA, hash4, hash5, 256 );
|
||||
intrlv_2x128( vhashB, hash6, hash7, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash4, hash5, vhashA, 256 );
|
||||
dintrlv_2x128( hash6, hash7, vhashB, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
|
@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_ror256_64( s1); \
|
||||
s3 = mm512_shufll256_64( s3 ); \
|
||||
s1 = mm512_shuflr256_64( s1); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_rol256_64( s3 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_rol256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_ror256_64( s3 );
|
||||
s3 = mm512_shuflr256_64( s3 ); \
|
||||
s1 = mm512_shufll256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
@@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_ror_1x64( s1); \
|
||||
s3 = mm256_shufll_64( s3 ); \
|
||||
s1 = mm256_shuflr_64( s1); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rol_1x64( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rol_1x64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_ror_1x64( s3 );
|
||||
s3 = mm256_shuflr_64( s3 ); \
|
||||
s1 = mm256_shufll_64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_ror256_64( s2, s3 ); \
|
||||
mm128_vrol256_64( s6, s7 ); \
|
||||
mm128_vror256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_rol256_64( s6, s7 ); \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rol256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_ror256_64( s6, s7 );
|
||||
mm128_vror256_64( s6, s7 ); \
|
||||
mm128_vrol256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
@@ -13,6 +13,7 @@
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/ripemd/sph_ripemd.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#define EPSa DBL_EPSILON
|
||||
#define EPS1 DBL_EPSILON
|
||||
@@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce )
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
sph_sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
sph_keccak512_context keccak;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx;
|
||||
|
||||
void init_m7m_ctx()
|
||||
{
|
||||
sph_sha256_init( &m7m_ctx );
|
||||
sha256_ctx_init( &m7m_ctx.sha256 );
|
||||
sph_sha512_init( &m7m_ctx.sha512 );
|
||||
sph_keccak512_init( &m7m_ctx.keccak );
|
||||
sph_whirlpool_init( &m7m_ctx.whirlpool );
|
||||
@@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
|
||||
sph_sha256_context ctxf_sha256;
|
||||
|
||||
memcpy(data, pdata, 80);
|
||||
|
||||
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
|
||||
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
|
||||
@@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
|
||||
|
||||
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha256_close( &ctx2.sha256, bhash[0] );
|
||||
sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sha256_final( &ctx2.sha256, bhash[0] );
|
||||
|
||||
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha512_close( &ctx2.sha512, bhash[1] );
|
||||
@@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
bytes = mpz_sizeinbase(product, 256);
|
||||
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, hash );
|
||||
sha256_full( hash, bdata, bytes );
|
||||
|
||||
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
|
||||
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
|
||||
@@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
mpzscale=bytes;
|
||||
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, hash );
|
||||
}
|
||||
sha256_full( hash, bdata, bytes );
|
||||
}
|
||||
|
||||
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
|
||||
&& !opt_benchmark ) )
|
||||
|
@@ -7,24 +7,19 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sph_ripemd.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
void lbry_hash(void* output, const void* input)
|
||||
{
|
||||
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
||||
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) hashA[16];
|
||||
uint32_t _ALIGN(64) hashB[16];
|
||||
uint32_t _ALIGN(64) hashC[16];
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, input, 112 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
sha256_full( hashA, input, 112 );
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
sph_sha512_init( &ctx_sha512 );
|
||||
sph_sha512( &ctx_sha512, hashA, 32 );
|
||||
@@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input)
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
||||
sph_ripemd160_close( &ctx_ripemd, hashC );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashB, 20 );
|
||||
sph_sha256( &ctx_sha256, hashC, 20 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
sha256_ctx_init( &ctx_sha256 );
|
||||
sha256_update( &ctx_sha256, hashB, 20 );
|
||||
sha256_update( &ctx_sha256, hashC, 20 );
|
||||
sha256_final( &ctx_sha256, hashA );
|
||||
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
memcpy( output, hashA, 32 );
|
||||
}
|
||||
|
||||
|
@@ -69,8 +69,12 @@ typedef unsigned int uint;
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64U
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32U
|
||||
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
|
||||
#define ROTL32(a,b) rol32(a,b)
|
||||
#define ROTR32(a,b) ror32(a,b)
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
|
3981
algo/scrypt/scrypt-core-4way.c
Normal file
3981
algo/scrypt/scrypt-core-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
70
algo/scrypt/scrypt-core-4way.h
Normal file
70
algo/scrypt/scrypt-core-4way.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifndef SCRYPT_CORE_4WAY_H__
|
||||
#define SCRYPT_CORE_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD over 4 way parallel
|
||||
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// 4 way parallel over serial SIMD
|
||||
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
|
||||
|
||||
// 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Double buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Triple buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD128 over 2 way parallel
|
||||
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered simd over parallel
|
||||
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered 2 way
|
||||
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered
|
||||
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// Parallel 4 way, 4x memory
|
||||
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// Linear SIMD 1 way, 1x memory, lowest
|
||||
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered, 2x memory
|
||||
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered
|
||||
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered, 4x memory
|
||||
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
// For reference only
|
||||
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
206
algo/scrypt/scrypt-core-ref.c
Normal file
206
algo/scrypt/scrypt-core-ref.c
Normal file
@@ -0,0 +1,206 @@
|
||||
#include "scrypt-core-ref.h"
|
||||
|
||||
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
|
||||
|
||||
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
||||
{
|
||||
uint32_t x0 = (B[ 0] ^= C[ 0]),
|
||||
x1 = (B[ 1] ^= C[ 1]),
|
||||
x2 = (B[ 2] ^= C[ 2]),
|
||||
x3 = (B[ 3] ^= C[ 3]);
|
||||
uint32_t x4 = (B[ 4] ^= C[ 4]),
|
||||
x5 = (B[ 5] ^= C[ 5]),
|
||||
x6 = (B[ 6] ^= C[ 6]),
|
||||
x7 = (B[ 7] ^= C[ 7]);
|
||||
uint32_t x8 = (B[ 8] ^= C[ 8]),
|
||||
x9 = (B[ 9] ^= C[ 9]),
|
||||
xa = (B[10] ^= C[10]),
|
||||
xb = (B[11] ^= C[11]);
|
||||
uint32_t xc = (B[12] ^= C[12]),
|
||||
xd = (B[13] ^= C[13]),
|
||||
xe = (B[14] ^= C[14]),
|
||||
xf = (B[15] ^= C[15]);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
B[ 0] += x0;
|
||||
B[ 1] += x1;
|
||||
B[ 2] += x2;
|
||||
B[ 3] += x3;
|
||||
B[ 4] += x4;
|
||||
B[ 5] += x5;
|
||||
B[ 6] += x6;
|
||||
B[ 7] += x7;
|
||||
B[ 8] += x8;
|
||||
B[ 9] += x9;
|
||||
B[10] += xa;
|
||||
B[11] += xb;
|
||||
B[12] += xc;
|
||||
B[13] += xd;
|
||||
B[14] += xe;
|
||||
B[15] += xf;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param X input/output
|
||||
* @param V scratch buffer
|
||||
* @param N factor (def. 1024)
|
||||
*/
|
||||
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
|
||||
{
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
memcpy(&V[i * 32], X, 128);
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
uint32_t j = 32 * (X[16] & (N - 1));
|
||||
for (uint8_t k = 0; k < 32; k++)
|
||||
X[k] ^= V[j + k];
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
}
|
||||
|
1476
algo/scrypt/scrypt.c
1476
algo/scrypt/scrypt.c
File diff suppressed because it is too large
Load Diff
@@ -39,10 +39,10 @@
|
||||
void
|
||||
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
|
||||
{
|
||||
sph_sha256_context ctx;
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, in, len );
|
||||
sph_sha256_close( &ctx, digest );
|
||||
sha256_context ctx;
|
||||
sha256_ctx_init( &ctx );
|
||||
sha256_update( &ctx, in, len );
|
||||
sha256_final( &ctx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
|
||||
void
|
||||
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
{
|
||||
unsigned char pad[64];
|
||||
unsigned char pad[64] __attribute__ ((aligned (64)));
|
||||
unsigned char khash[32];
|
||||
const unsigned char * K = _K;
|
||||
size_t i;
|
||||
@@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
if ( Klen > 64 )
|
||||
{
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
sph_sha256( &ctx->ictx, K, Klen );
|
||||
sph_sha256_close( &ctx->ictx, khash );
|
||||
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
sha256_update( &ctx->ictx, K, Klen );
|
||||
sha256_final( &ctx->ictx, khash );
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
}
|
||||
|
||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
|
||||
|
||||
memset( pad + Klen, 0x36, 64 - Klen );
|
||||
sph_sha256( &ctx->ictx, pad, 64 );
|
||||
sha256_update( &ctx->ictx, pad, 64 );
|
||||
|
||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||
sph_sha256_init( &ctx->octx );
|
||||
sha256_ctx_init( &ctx->octx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
|
||||
|
||||
memset( pad + Klen, 0x5c, 64 - Klen );
|
||||
sph_sha256( &ctx->octx, pad, 64 );
|
||||
sha256_update( &ctx->octx, pad, 64 );
|
||||
}
|
||||
|
||||
/* Add bytes to the HMAC-SHA256 operation. */
|
||||
@@ -102,18 +101,17 @@ void
|
||||
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
|
||||
{
|
||||
/* Feed data to the inner SHA256 operation. */
|
||||
sph_sha256( &ctx->ictx, in, len );
|
||||
sha256_update( &ctx->ictx, in, len );
|
||||
}
|
||||
|
||||
/* Finish an HMAC-SHA256 operation. */
|
||||
void
|
||||
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
|
||||
HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
|
||||
{
|
||||
unsigned char ihash[32];
|
||||
|
||||
sph_sha256_close( &ctx->ictx, ihash );
|
||||
sph_sha256( &ctx->octx, ihash, 32 );
|
||||
sph_sha256_close( &ctx->octx, digest );
|
||||
uint32_t ihash[8] __attribute__ ((aligned (32)));
|
||||
sha256_final( &ctx->ictx, ihash );
|
||||
sha256_update( &ctx->octx, ihash, 32 );
|
||||
sha256_final( &ctx->octx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
|
||||
{
|
||||
HMAC_SHA256_CTX PShctx, hctx;
|
||||
uint8_t _ALIGN(128) T[32];
|
||||
uint8_t _ALIGN(128) U[32];
|
||||
uint64_t _ALIGN(128) T[4];
|
||||
uint64_t _ALIGN(128) U[4];
|
||||
// uint8_t _ALIGN(128) T[32];
|
||||
// uint8_t _ALIGN(128) U[32];
|
||||
uint32_t ivec;
|
||||
size_t i, clen;
|
||||
uint64_t j;
|
||||
@@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
// _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
|
||||
// _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );
|
||||
|
||||
// for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
|
||||
for ( k = 0; k < 32; k++ )
|
||||
T[k] ^= U[k];
|
||||
// for ( k = 0; k < 32; k++ )
|
||||
// T[k] ^= U[k];
|
||||
}
|
||||
|
||||
/* Copy as many bytes as necessary into buf. */
|
||||
|
@@ -31,18 +31,18 @@
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_sha2.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
typedef struct HMAC_SHA256Context
|
||||
{
|
||||
sph_sha256_context ictx;
|
||||
sph_sha256_context octx;
|
||||
sha256_context ictx;
|
||||
sha256_context octx;
|
||||
} HMAC_SHA256_CTX;
|
||||
|
||||
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
|
||||
size_t len, uint8_t digest[32] );
|
||||
|
||||
|
@@ -59,7 +59,9 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
|
||||
#endif // SSE2
|
||||
@@ -79,8 +81,10 @@ void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -99,7 +103,9 @@ void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
|
||||
const __m512i *state_in );
|
||||
|
@@ -180,6 +180,7 @@ static const uint32_t sha256d_hash1[16] = {
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000100
|
||||
};
|
||||
|
||||
// this performs the entire hash all over again, why?
|
||||
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
{
|
||||
uint32_t S[16];
|
||||
@@ -195,6 +196,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
hash[i] = swab32(hash[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
#if defined (__SHA__)
|
||||
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
@@ -241,6 +243,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
static inline void sha256d_preextend(uint32_t *W)
|
||||
{
|
||||
@@ -653,6 +656,7 @@ int scanhash_sha256d( struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -682,13 +686,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d;
|
||||
gate->hash = (void*)&sha256d;
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -7,9 +7,9 @@
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
@@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
|
||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||
|
||||
// Load initial values
|
||||
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE_X = STATE0_X;
|
||||
ABEF_SAVE_Y = STATE0_Y;
|
||||
CDGH_SAVE_X = STATE1_X;
|
||||
CDGH_SAVE_Y = STATE1_Y;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
|
||||
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
|
||||
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
|
||||
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
|
||||
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
|
||||
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 16-19
|
||||
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 20-23
|
||||
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 24-27
|
||||
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 28-31
|
||||
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 32-35
|
||||
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -74,17 +74,6 @@ static const uint32_t K256[64] =
|
||||
// CHs: bitwise select, evaluated as ((Y ^ Z) & X) ^ Z.
// Each result bit is taken from Y where the matching bit of X is 1 and from Z
// where it is 0. All three arguments are __m128i values; Y and Z are each
// referenced once and X once in the expansion.
#define CHs(X, Y, Z) \
|
||||
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
|
||||
|
||||
/*
|
||||
#define MAJs(X, Y, Z) \
|
||||
_mm_or_si128( _mm_and_si128( X, Y ), \
|
||||
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
|
||||
*/
|
||||
/*
|
||||
#define MAJs(X, Y, Z) \
|
||||
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
|
||||
_mm_xor_si128( Y, Z ) ) )
|
||||
*/
|
||||
|
||||
// MAJs: evaluates Y ^ ( (X ^ Y) & Y_xor_Z ).
// This macro is not self-contained:
//  - it assigns X ^ Y to a variable named X_xor_Y as a side effect, and
//  - it reads a variable named Y_xor_Z instead of computing Y ^ Z.
// Both variables must be declared in the scope where the macro is expanded.
// The Z argument is not referenced by the expansion, so the result equals the
// bitwise majority of X, Y, Z only if the caller keeps Y_xor_Z equal to Y ^ Z.
#define MAJs(X, Y, Z) \
|
||||
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
@@ -105,38 +94,6 @@ static const uint32_t K256[64] =
|
||||
_mm_xor_si128( _mm_xor_si128( \
|
||||
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
|
||||
|
||||
/*
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m128i T1 = mm128_ror_32( E, 14 ); \
|
||||
__m128i T2 = mm128_ror_32( A, 9 ); \
|
||||
__m128i T3 = _mm_xor_si128( F, G ); \
|
||||
__m128i T4 = _mm_or_si128( A, B ); \
|
||||
__m128i T5 = _mm_and_si128( A, B ); \
|
||||
K = _mm_add_epi32( K, W[i] ); \
|
||||
T1 = _mm_xor_si128( T1, E ); \
|
||||
T2 = _mm_xor_si128( T2, A ); \
|
||||
T3 = _mm_and_si128( T3, E ); \
|
||||
T4 = _mm_and_si128( T4, C ); \
|
||||
K = _mm_add_epi32( H, K ); \
|
||||
T1 = mm128_ror_32( T1, 5 ); \
|
||||
T2 = mm128_ror_32( T2, 11 ); \
|
||||
T3 = _mm_xor_si128( T3, G ); \
|
||||
T4 = _mm_or_si128( T4, T5 ); \
|
||||
T1 = _mm_xor_si128( T1, E ); \
|
||||
T2 = _mm_xor_si128( T2, A ); \
|
||||
T1 = mm128_ror_32( T1, 6 ); \
|
||||
T2 = mm128_ror_32( T2, 2 ); \
|
||||
T1 = _mm_add_epi32( T1, T3 ); \
|
||||
T2 = _mm_add_epi32( T2, T4 ); \
|
||||
T1 = _mm_add_epi32( T1, K ); \
|
||||
H = _mm_add_epi32( T1, T2 ); \
|
||||
D = _mm_add_epi32( D, T1 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i T1, T2; \
|
||||
@@ -149,8 +106,8 @@ do { \
|
||||
H = _mm_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
// LE data, no need to byte swap
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in )
|
||||
{
|
||||
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
@@ -232,6 +189,91 @@ void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
state_out[7] = _mm_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
// BE data, need to byte swap
//
// Processes one message block of 16 __m128i words and produces an updated
// 8-word state. Each __m128i is handled as four 32-bit elements (the final
// additions use _mm_add_epi32), so four independent hashes advance in parallel.
//
//   state_out  receives state_in[k] + working variable k, for k = 0..7.
//   data       16 input vectors; copied into the local W[] through
//              mm128_block_bswap_32, so the caller's buffer is not modified.
//   state_in   8 input state vectors; read at entry and again for the final
//              additions.
//
// The function performs 16 + 3*16 = 64 SHA2s_4WAY_STEP invocations.
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in )
|
||||
{
|
||||
// X_xor_Y / Y_xor_Z are not used by name below; they are referenced from
// inside the MAJs macro and must exist in this scope.
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
__m128i W[16];
|
||||
|
|
||||
// Two calls fill W[0..15]; presumably each call byte-swaps 8 vectors
// (helper is defined elsewhere) -- TODO confirm.
mm128_block_bswap_32( W, data );
|
||||
mm128_block_bswap_32( W+8, data+8 );
|
||||
|
|
||||
// Load the working variables from the incoming state.
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
// Seed for MAJs, which reads Y_xor_Z in place of computing Y ^ Z; for the
// first step the Y and Z arguments are B and C.
Y_xor_Z = _mm_xor_si128( B, C );
|
||||
|
|
||||
// Steps 0..15 use the byte-swapped input words directly. The eight variables
// are passed rotated by one position per step instead of being shuffled, so
// after every 8 steps the argument order is back to A..H.
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
|
||||
// Remaining 48 steps in three passes (j = 16, 32, 48). Each pass first
// rewrites all 16 entries of W[] in place, then runs 16 steps with j passed
// as the last macro argument.
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
// SHA2s_MEXP is defined elsewhere in this file; presumably the SHA-256
// message schedule expansion over the given W[] indices -- TODO confirm.
// The assignments are order-dependent: later entries read W[] values
// already overwritten earlier in this same pass (e.g. W[2] reads W[0]).
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
|
||||
|
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
|
||||
// Feed-forward: add the working variables to the incoming state, per 32-bit
// element, and store the sum as the outgoing state.
state_out[0] = _mm_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
|
||||
{
|
||||
@@ -436,61 +478,81 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
_mm256_ternarylogic_epi32( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG2_0x(x) \
|
||||
mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )
|
||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
|
||||
mm256_ror_32( x, 13 ) ), \
|
||||
mm256_ror_32( x, 22 ) )
|
||||
|
||||
#define BSG2_1x(x) \
|
||||
mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )
|
||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
|
||||
mm256_ror_32( x, 11 ) ), \
|
||||
mm256_ror_32( x, 25 ) )
|
||||
|
||||
#define SSG2_0x(x) \
|
||||
mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) )
|
||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
|
||||
mm256_ror_32( x, 18 ) ), \
|
||||
_mm256_srli_epi32( x, 3 ) )
|
||||
|
||||
#define SSG2_1x(x) \
|
||||
mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) )
|
||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
|
||||
mm256_ror_32( x, 19 ) ), \
|
||||
_mm256_srli_epi32( x, 10 ) )
|
||||
|
||||
#define SHA2x_MEXP( a, b, c, d ) \
|
||||
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
|
||||
|
||||
// With AVX512VL ternary logic optimizations are available.
|
||||
// If not, optimize by forwarding the result of X^Y in MAJ to the next round
|
||||
// to avoid recalculating it as Y^Z. This optimization is not applicable
|
||||
// when MAJ is optimized with ternary logic.
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// CHx via one ternary-logic op. Truth table 0xca yields, per bit, Y where X
// is 1 and Z where X is 0 (i.e. X ? Y : Z).
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
|
||||
|
||||
// MAJx via one ternary-logic op. Truth table 0xe8 yields, per bit, 1 when at
// least two of X, Y, Z are 1 (bitwise majority). Unlike the xor/and form of
// MAJ it does not touch X_xor_Y or Y_xor_Z.
#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
||||
|
||||
// One SHA-256 step on __m256i operands (32-bit element arithmetic, so eight
// elements per vector). Requires K256[] and W[] to be visible at the
// expansion site. Computation, in terms of the macro arguments:
//   T0 = K256[j+i] + W[i] + CHx(E, F, G)
//   T1 = BSG2_1x(E) + H + T0
//   T2 = BSG2_0x(A) + MAJx(A, B, C)
//   D += T1;  H = T1 + T2
// Only D and H are written; callers rotate the argument list between steps
// rather than moving values between variables.
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
||||
T1 = _mm256_add_epi32( T1, H ); \
|
||||
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
|
||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
#else // AVX2
|
||||
|
||||
// CHx without ternary logic: ((Y ^ Z) & X) ^ Z, which per bit gives Y where
// X is 1 and Z where X is 0.
#define CHx(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
|
||||
_mm256_xor_si256( Y, Z ) ) )
|
||||
/*
|
||||
// Use saved X_xor_Y from previous round, now called Y_xor_Z,
|
||||
// and save new X_xor_Y, for next round.
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
*/
|
||||
|
||||
#define BSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) )
|
||||
|
||||
#define BSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) )
|
||||
|
||||
#define SSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) )
|
||||
|
||||
#define SSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
|
||||
|
||||
#endif // AVX512 else AVX2
|
||||
|
||||
#define SHA2x_MEXP( a, b, c, d ) \
|
||||
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
|
||||
// One SHA-256 step on __m256i operands (32-bit element arithmetic), in the
// form that carries X_xor_Y forward between steps. Requires K256[], W[],
// X_xor_Y and Y_xor_Z to be visible at the expansion site.
//   T0 = K256[j+i] + W[i] + CHx(E, F, G)
//   T1 = BSG2_1x(E) + H + T0
//   T2 = BSG2_0x(A) + MAJx(A, B, C)
//   Y_xor_Z = X_xor_Y        (done after MAJx has been evaluated)
//   D += T1;  H = T1 + T2
// The copy into Y_xor_Z is only meaningful when MAJx is the variant that
// assigns X_xor_Y; with a MAJx that does not assign it, X_xor_Y is read
// without having been set here.
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
||||
T1 = _mm256_add_epi32( T1, H ); \
|
||||
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
|
||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m256i T1, T2; \
|
||||
@@ -498,16 +560,23 @@ do { \
|
||||
T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
#endif // AVX512VL else AVX2
|
||||
|
||||
// accepts LE byte ordered data, skip the byte swap
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H;
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z;
|
||||
#endif
|
||||
__m256i W[16];
|
||||
|
||||
memcpy_256( W, data, 16 );
|
||||
|
||||
A = state_in[0];
|
||||
@@ -519,6 +588,101 @@ void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
||||
state_out[0] = _mm256_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm256_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm256_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm256_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm256_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm256_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm256_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm256_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
|
||||
// Accepts BE byte ordered data, need to byte swap
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H;
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z;
|
||||
#endif
|
||||
__m256i W[16];
|
||||
|
||||
mm256_block_bswap_32( W , data );
|
||||
mm256_block_bswap_32( W+8, data+8 );
|
||||
|
||||
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
@@ -587,6 +751,9 @@ static void
|
||||
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
register __m256i A, B, C, D, E, F, G, H;
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z;
|
||||
#endif
|
||||
__m256i W[16];
|
||||
|
||||
mm256_block_bswap_32( W , in );
|
||||
@@ -615,6 +782,10 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
H = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
@@ -790,27 +961,44 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
// SHA-256 16 way
|
||||
|
||||
#define CHx16(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi32( X, Y, Z, 0xca )
|
||||
#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJx16(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
||||
#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG2_0x16(x) \
|
||||
mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )
|
||||
#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \
|
||||
_mm512_ror_epi32( x, 13 ), \
|
||||
_mm512_ror_epi32( x, 22 ) )
|
||||
|
||||
#define BSG2_1x16(x) \
|
||||
mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )
|
||||
#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \
|
||||
_mm512_ror_epi32( x, 11 ), \
|
||||
_mm512_ror_epi32( x, 25 ) )
|
||||
|
||||
#define SSG2_0x16(x) \
|
||||
mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )
|
||||
#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \
|
||||
_mm512_ror_epi32( x, 18 ), \
|
||||
_mm512_srli_epi32( x, 3 ) )
|
||||
|
||||
#define SSG2_1x16(x) \
|
||||
mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )
|
||||
#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \
|
||||
_mm512_ror_epi32( x, 19 ), \
|
||||
_mm512_srli_epi32( x, 10 ) )
|
||||
|
||||
#define SHA2x16_MEXP( a, b, c, d ) \
|
||||
mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
|
||||
|
||||
// One SHA-256 step on __m512i operands (32-bit element arithmetic, so sixteen
// elements per vector). Requires K256[] and W[] to be visible at the
// expansion site. Computation, in terms of the macro arguments:
//   T0 = K256[j+i] + W[i] + CHx16(E, F, G)
//   T1 = BSG2_1x16(E) + H + T0
//   T2 = BSG2_0x16(A) + MAJx16(A, B, C)
//   D += T1;  H = T1 + T2
// Only D and H are written; callers rotate the argument list between steps.
#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m512i T1 = BSG2_1x16( E ); \
|
||||
__m512i T2 = BSG2_0x16( A ); \
|
||||
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
|
||||
T1 = _mm512_add_epi32( T1, H ); \
|
||||
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
|
||||
T1 = _mm512_add_epi32( T1, T0 ); \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
@@ -821,14 +1009,10 @@ do { \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
// Transform one 16-lane by 64-byte message block and update state.
|
||||
// Calling function is responsible for initializing the state, setting
|
||||
// correct byte order, counting bits and padding of the final block.
|
||||
// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating
|
||||
// redundant byte swapping.
|
||||
//
|
||||
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
// accepts LE input data
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in )
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
@@ -909,6 +1093,89 @@ void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
state_out[7] = _mm512_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
// Accepts BE input data, need to bswap
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in )
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16];
|
||||
|
||||
mm512_block_bswap_32( W , data );
|
||||
mm512_block_bswap_32( W+8, data+8 );
|
||||
|
||||
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
||||
state_out[0] = _mm512_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm512_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm512_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm512_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm512_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm512_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm512_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm512_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
// Aggressive prehashing
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
|
||||
const __m512i *state_in )
|
||||
|
@@ -7,9 +7,9 @@
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
@@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
|
||||
// SHA-256 block transform using the x86 SHA extensions, for a message block
// supplied as a raw byte stream: every 32-bit word of the block is byte
// swapped (MASK shuffle) as it is loaded.
//
// state_out  receives the 8 word state after the block (may equal state_in;
//            state_in is fully read before anything is stored).
// input      64 byte message block. It is read with unaligned loads, so it
//            may sit at any address.
// state_in   8 word state before the block.
//
// state_in and state_out are accessed with aligned 128-bit loads/stores and
// must therefore be 16 byte aligned.
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
                              const uint32_t *state_in )
{
    // Byte pointer for standard-conforming address arithmetic on input.
    const uint8_t *msg = (const uint8_t*)input;
    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_load_si128((const __m128i*) &state_in[0]);
    STATE1 = _mm_load_si128((const __m128i*) &state_in[4]);
    // Shuffle control that reverses the bytes of each 32-bit word.
    MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    // Rearrange the state into the ABEF / CDGH layout used by sha256rnds2.
    // Lane comments list elements from the highest lane to the lowest.
    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH

    // Save current hash
    ABEF_SAVE = STATE0;
    CDGH_SAVE = STATE1;

    // Rounds 0-3
    TMSG0 = _mm_loadu_si128((const __m128i*) (msg+0));
    TMSG0 = _mm_shuffle_epi8(TMSG0, MASK);
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 4-7
    TMSG1 = _mm_loadu_si128((const __m128i*) (msg+16));
    TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 8-11
    TMSG2 = _mm_loadu_si128((const __m128i*) (msg+32));
    TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 12-15
    TMSG3 = _mm_loadu_si128((const __m128i*) (msg+48));
    TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 16-19
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 20-23
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 24-27
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 28-31
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 32-35
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 36-39
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 40-43
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 44-47
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 48-51
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 52-55
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 56-59
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 60-63
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Add values back to state
    STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
    STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

    // Undo the ABEF / CDGH arrangement so memory order is A..D, E..H again.
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE

    // Save state
    _mm_store_si128((__m128i*) &state_out[0], STATE0);
    _mm_store_si128((__m128i*) &state_out[4], STATE1);
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,18 +0,0 @@
|
||||
#ifndef SHA2_HASH_OPT_H__
|
||||
#define SHA2_HASH_OPT_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
// 2 way with interleaved instructions
|
||||
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
#endif
|
||||
#endif
|
142
algo/sha/sha256-hash.c
Normal file
142
algo/sha/sha256-hash.c
Normal file
@@ -0,0 +1,142 @@
|
||||
#include "sha256-hash.h"
|
||||
|
||||
// Initial SHA-256 state words H0..H7, copied into a context by
// sha256_ctx_init.
static const uint32_t SHA256_IV[8] =
{
   0x6A09E667,   // H0
   0xBB67AE85,   // H1
   0x3C6EF372,   // H2
   0xA54FF53A,   // H3
   0x510E527F,   // H4
   0x9B05688C,   // H5
   0x1F83D9AB,   // H6
   0x5BE0CD19    // H7
};
|
||||
|
||||
/*
|
||||
static const uint8_t SHA256_PAD[64] =
|
||||
{
|
||||
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
*/
|
||||
|
||||
// Reset a context for a new message: the state is set to the SHA-256
// initial values and the byte counter is cleared. The data buffer is left
// untouched; it is only meaningful for the first (count & 0x3f) bytes.
void sha256_ctx_init( sha256_context *ctx )
{
   for ( int w = 0; w < 8; w++ )
      ctx->state[w] = SHA256_IV[w];

   ctx->count = 0;
}
|
||||
|
||||
// Absorb len bytes of message data into the context.
//
// ctx->count is the total number of bytes absorbed so far; its low 6 bits
// give the fill level of ctx->buf. Complete 64 byte blocks are passed to
// sha256_transform_be, any remainder stays in ctx->buf for the next call
// or for sha256_final.
//
// data may be NULL only when len is 0.
void sha256_update( sha256_context *ctx, const void *data, size_t len )
{
   const uint8_t *src = (const uint8_t*)data;
   const size_t fill = (size_t)( ctx->count & 0x3f );  // bytes already buffered
   const size_t room = 64 - fill;                      // 1..64 bytes free

   // Nothing to do; also keeps memcpy away from a possibly NULL source.
   if ( len == 0 ) return;

   ctx->count += (uint64_t)len;

   // Not enough to complete a block: just buffer it.
   if ( len < room )
   {
      memcpy( ctx->buf + fill, src, len );
      return;
   }

   // Complete the buffered block and process it.
   memcpy( ctx->buf + fill, src, room );
   sha256_transform_be( ctx->state, (const uint32_t*)ctx->buf, ctx->state );
   src += room;
   len -= room;

   // Process the remaining whole blocks.
   while ( len >= 64 )
   {
      if ( ( (uintptr_t)src & 15 ) == 0 )
      {
         // Suitably aligned: hash straight from the caller's buffer.
         sha256_transform_be( ctx->state, (const uint32_t*)src, ctx->state );
      }
      else
      {
         // The transform may read its input with aligned 128-bit loads
         // (the SHA extension implementation does), which would fault on
         // arbitrary caller data. Stage the block through ctx->buf, the
         // same buffer already used for the first block above.
         memcpy( ctx->buf, src, 64 );
         sha256_transform_be( ctx->state, (const uint32_t*)ctx->buf,
                              ctx->state );
      }
      src += 64;
      len -= 64;
   }

   // Keep the tail (0..63 bytes) for later.
   memcpy( ctx->buf, src, len );
}
|
||||
|
||||
#if 0
|
||||
void sha256_final( sha256_context *ctx, uint32_t *hash )
|
||||
{
|
||||
size_t r;
|
||||
|
||||
|
||||
/* Figure out how many bytes we have buffered. */
|
||||
r = ctx->count & 0x3f;
|
||||
// r = ( ctx->count >> 3 ) & 0x3f;
|
||||
|
||||
//printf("final: count= %d, r= %d\n", ctx->count, r );
|
||||
|
||||
/* Pad to 56 mod 64, transforming if we finish a block en route. */
|
||||
if ( r < 56 )
|
||||
{
|
||||
/* Pad to 56 mod 64. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Finish the current block and mix. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
/* The start of the final block is all zeroes. */
|
||||
memset( &ctx->buf[0], 0, 56 );
|
||||
}
|
||||
|
||||
/* Add the terminating bit-count. */
|
||||
ctx->buf[56] = bswap_64( ctx->count << 3 );
|
||||
// ctx->buf[56] = bswap_64( ctx->count );
|
||||
// be64enc( &ctx->buf[56], ctx->count );
|
||||
|
||||
/* Mix in the final block. */
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
|
||||
|
||||
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
|
||||
|
||||
/*
|
||||
// be32enc_vect(digest, ctx->state, 4);
|
||||
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
|
||||
// Encode vector, two words at a time.
|
||||
do {
|
||||
be32enc(&dst[0], src[0]);
|
||||
be32enc(&dst[4], src[1]);
|
||||
src += 2;
|
||||
dst += 8;
|
||||
} while (--len);
|
||||
*/
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
// Finish the message held in ctx and write the 32 byte digest to hash.
//
// Padding: a 0x80 byte, zeros up to byte 56 of the last block, then the
// message length in bits (ctx->count << 3) byte swapped with bswap_64 into
// bytes 56..63. If the 0x80 byte leaves fewer than 8 bytes free, the current
// block is zero filled and processed first and the length goes into an extra
// block. Each state word is byte swapped with bswap_32 on output.
//
// hash may have any alignment. ctx is consumed: re-initialise it with
// sha256_ctx_init before reuse.
void sha256_final( sha256_context *ctx, void *hash )
{
   uint8_t *out = (uint8_t*)hash;
   int ptr = ctx->count & 0x3f;

   ctx->buf[ ptr++ ] = 0x80;

   if ( ptr > 56 )
   {
      // No room left for the 8 byte length: flush this block first.
      memset( ctx->buf + ptr, 0, 64 - ptr );
      sha256_transform_be( ctx->state, (const uint32_t*)ctx->buf, ctx->state );
      memset( ctx->buf, 0, 56 );
   }
   else
      memset( ctx->buf + ptr, 0, 56 - ptr );

   // Store through memcpy rather than a uint64_t lvalue: ctx->buf is a byte
   // array and must not be written through an incompatible pointer type.
   const uint64_t bit_count = bswap_64( ctx->count << 3 );
   memcpy( ctx->buf + 56, &bit_count, sizeof bit_count );

   sha256_transform_be( ctx->state, (const uint32_t*)ctx->buf, ctx->state );

   for ( int i = 0; i < 8; i++ )
   {
      // memcpy keeps the store valid for any alignment of hash.
      const uint32_t word = bswap_32( ctx->state[i] );
      memcpy( out + 4*i, &word, sizeof word );
   }
}
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len )
|
||||
{
|
||||
sha256_context ctx;
|
||||
sha256_ctx_init( &ctx );
|
||||
sha256_update( &ctx, data, len );
|
||||
sha256_final( &ctx, hash );
|
||||
}
|
||||
|
56
algo/sha/sha256-hash.h
Normal file
56
algo/sha/sha256-hash.h
Normal file
@@ -0,0 +1,56 @@
|
||||
#ifndef SHA256_HASH_H__
|
||||
#define SHA256_HASH_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "cpuminer-config.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
|
||||
// generic interface
|
||||
|
||||
// Streaming SHA-256 context used by sha256_ctx_init / sha256_update /
// sha256_final. The type is declared with 64 byte alignment.
typedef struct {
   unsigned char buf[64];   /* first field, for alignment; holds a partial
                               message block between calls */
   uint32_t state[8];       /* current hash state */
   uint64_t count;          /* bytes absorbed so far; the low 6 bits are the
                               fill level of buf */
} sha256_context __attribute__((aligned(64)));
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len );
|
||||
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||
void sha256_final( sha256_context *ctx, void *hash );
|
||||
void sha256_ctx_init( sha256_context *ctx );
|
||||
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
// 2 way with interleaved instructions
|
||||
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
// Select target
|
||||
// with SHA...
|
||||
#define sha256_transform_le sha256_opt_transform_le
|
||||
#define sha256_transform_be sha256_opt_transform_be
|
||||
|
||||
#else
|
||||
|
||||
// without SHA...
|
||||
#define sha256_transform_le sph_sha256_transform_le
|
||||
#define sha256_transform_be sph_sha256_transform_be
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -14,6 +14,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
@@ -23,7 +24,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
@@ -45,27 +46,30 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, midstate );
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
@@ -85,7 +89,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
@@ -128,7 +131,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -137,14 +140,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
sha256_8way_transform_le( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
@@ -209,7 +212,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -218,14 +221,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
sha256_4way_transform_le( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
8
algo/sha/sha256d.c
Normal file
8
algo/sha/sha256d.c
Normal file
@@ -0,0 +1,8 @@
|
||||
#include "sha256d.h"
|
||||
|
||||
// Double SHA-256: hash = SHA256( SHA256( data[0..len) ) ).
// The second pass hashes the 32 byte intermediate digest in place. This is
// safe because sha256_update copies an input shorter than 64 bytes into the
// context buffer, and sha256_final writes the output only after its last
// transform.
// len is converted to size_t when passed on, so it must not be negative.
void sha256d( void *hash, const void *data, int len )
{
   // first pass over the message
   sha256_full( hash, data, (size_t)len );

   // second pass over the 32 byte digest
   sha256_full( hash, hash, (size_t)32 );
}
|
||||
|
7
algo/sha/sha256d.h
Normal file
7
algo/sha/sha256d.h
Normal file
@@ -0,0 +1,7 @@
|
||||
#ifndef SHA256D_H__
#define SHA256D_H__ 1

#include "algo-gate-api.h"
#include <string.h>
#include <inttypes.h>
#include "sha256-hash.h"

// Double SHA-256 of len bytes at data; the 32 byte result is written to hash.
void sha256d( void *hash, const void *data, int len );

#endif
|
||||
|
@@ -3,14 +3,14 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
||||
static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256q_midstate( const void* input )
|
||||
{
|
||||
sph_sha256_init( &sha256q_ctx );
|
||||
sph_sha256( &sha256q_ctx, input, 64 );
|
||||
sha256_ctx_init( &sha256q_ctx );
|
||||
sha256_update( &sha256q_ctx, input, 64 );
|
||||
}
|
||||
|
||||
int sha256q_hash( void* output, const void* input )
|
||||
@@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input )
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
sha256_context ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
|
||||
|
||||
sph_sha256( &ctx, input + midlen, tail );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, output );
|
||||
sha256_update( &ctx, input + midlen, tail );
|
||||
sha256_final( &ctx, hash );
|
||||
|
||||
sha256_full( hash, hash, 32 );
|
||||
sha256_full( hash, hash, 32 );
|
||||
sha256_full( output, hash, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@@ -47,7 +47,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
sha256_16way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
@@ -60,18 +60,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
// sha256_16way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
@@ -137,7 +136,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -146,18 +145,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
sha256_8way_transform_le( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
@@ -222,7 +221,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -231,18 +230,18 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
sha256_4way_transform_le( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
@@ -4,120 +4,12 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
//#include "algo/sha/sph_sha2.h"
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
// Only used on CPUs with SHA
|
||||
|
||||
/*
|
||||
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_midstate( const void* input )
|
||||
{
|
||||
sph_sha256_init( &sha256t_ctx );
|
||||
sph_sha256( &sha256t_ctx, input, 64 );
|
||||
}
|
||||
|
||||
int sha256t_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[16];
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
|
||||
|
||||
sph_sha256( &ctx, input + midlen, tail );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, output );
|
||||
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
|
||||
|
||||
|
||||
// uint32_t edata[20] __attribute__((aligned(64)));
|
||||
// uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// mm128_bswap32_80( edata, pdata );
|
||||
// sha256t_midstate( edata );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block, pdata + 16, 16 );
|
||||
block[ 4] = 0x80000000;
|
||||
memset( block + 5, 0, 40 );
|
||||
block[15] = 80*8; // bit count
|
||||
sha256_opt_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block, hash32, 32 );
|
||||
block[ 8] = 0x80000000;
|
||||
memset( block + 9, 0, 24 );
|
||||
block[15] = 32*8; // bit count
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block, hash32, 32 );
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash32, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash32, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
submit_solution( work, hash32, mythr );
|
||||
n++;
|
||||
pdata[19] = n;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
sha256_opt_transform_le( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
@@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash0, 0 ) =
|
||||
|
@@ -95,32 +95,36 @@ static const uint64_t K512[80] =
|
||||
|
||||
// SHA-512 8 way 64 bit
|
||||
|
||||
#define CH8W(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJ8W(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG8W_5_0(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )
|
||||
#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \
|
||||
_mm512_ror_epi64( x, 34 ), \
|
||||
_mm512_ror_epi64( x, 39 ) )
|
||||
|
||||
#define BSG8W_5_1(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )
|
||||
#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \
|
||||
_mm512_ror_epi64( x, 18 ), \
|
||||
_mm512_ror_epi64( x, 41 ) )
|
||||
|
||||
#define SSG8W_5_0(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )
|
||||
#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \
|
||||
_mm512_ror_epi64( x, 8 ), \
|
||||
_mm512_srli_epi64( x, 7 ) )
|
||||
|
||||
#define SSG8W_5_1(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
|
||||
#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \
|
||||
_mm512_ror_epi64( x, 61 ), \
|
||||
_mm512_srli_epi64( x, 6 ) )
|
||||
|
||||
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
|
||||
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
|
||||
D = _mm512_add_epi64( D, T1 ); \
|
||||
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
|
||||
__m512i T1 = BSG8W_5_1( E ); \
|
||||
__m512i T2 = BSG8W_5_0( A ); \
|
||||
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
|
||||
T1 = _mm512_add_epi64( T1, H ); \
|
||||
T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \
|
||||
T1 = _mm512_add_epi64( T1, T0 ); \
|
||||
D = _mm512_add_epi64( D, T1 ); \
|
||||
H = _mm512_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
@@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
// SHA-512 4 way 64 bit
|
||||
|
||||
|
||||
#define CH(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
/*
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
*/
|
||||
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
@@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
||||
|
||||
/*
|
||||
#define BSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
|
||||
|
||||
#define BSG5_1(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
|
||||
*/
|
||||
/*
|
||||
#define SSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
|
||||
return _mm256_add_epi64( w0a, w1a );
|
||||
}
|
||||
|
||||
/*
|
||||
#define SSG512x2_0( w0, w1, i ) do \
|
||||
{ \
|
||||
__m256i X0a, X1a, X0b, X1b; \
|
||||
X0a = mm256_ror_64( W[i-15], 1 ); \
|
||||
X1a = mm256_ror_64( W[i-14], 1 ); \
|
||||
X0b = mm256_ror_64( W[i-15], 8 ); \
|
||||
X1b = mm256_ror_64( W[i-14], 8 ); \
|
||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
||||
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
|
||||
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
|
||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
||||
} while(0)
|
||||
|
||||
#define SSG512x2_1( w0, w1, i ) do \
|
||||
{ \
|
||||
__m256i X0a, X1a, X0b, X1b; \
|
||||
X0a = mm256_ror_64( W[i-2],19 ); \
|
||||
X1a = mm256_ror_64( W[i-1],19 ); \
|
||||
X0b = mm256_ror_64( W[i-2],61 ); \
|
||||
X1b = mm256_ror_64( W[i-1],61 ); \
|
||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
||||
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
|
||||
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
|
||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
||||
} while(0)
|
||||
*/
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
||||
__m256i T1 = mm256_ror_64( E, 23 ); \
|
||||
__m256i T2 = mm256_ror_64( A, 5 ); \
|
||||
__m256i T3 = _mm256_xor_si256( F, G ); \
|
||||
__m256i T4 = _mm256_or_si256( A, B ); \
|
||||
__m256i T5 = _mm256_and_si256( A, B ); \
|
||||
K = _mm256_add_epi64( K, W[i] ); \
|
||||
T1 = _mm256_xor_si256( T1, E ); \
|
||||
T2 = _mm256_xor_si256( T2, A ); \
|
||||
T3 = _mm256_and_si256( T3, E ); \
|
||||
T4 = _mm256_and_si256( T4, C ); \
|
||||
K = _mm256_add_epi64( H, K ); \
|
||||
T1 = mm256_ror_64( T1, 4 ); \
|
||||
T2 = mm256_ror_64( T2, 6 ); \
|
||||
T3 = _mm256_xor_si256( T3, G ); \
|
||||
T4 = _mm256_or_si256( T4, T5 ); \
|
||||
T1 = _mm256_xor_si256( T1, E ); \
|
||||
T2 = _mm256_xor_si256( T2, A ); \
|
||||
T1 = mm256_ror_64( T1, 14 ); \
|
||||
T2 = mm256_ror_64( T2, 28 ); \
|
||||
T1 = _mm256_add_epi64( T1, T3 ); \
|
||||
T2 = _mm256_add_epi64( T2, T4 ); \
|
||||
T1 = _mm256_add_epi64( T1, K ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
} while (0)
|
||||
*/
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
|
||||
__m256i T1 = BSG5_1(E); \
|
||||
__m256i T2 = BSG5_0(A); \
|
||||
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
|
||||
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i T1, T2; \
|
||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
||||
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
T1 = _mm256_add_epi64( T1, H ); \
|
||||
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
|
||||
T1 = _mm256_add_epi64( T1, T0 ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
static void
|
||||
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
|
@@ -71,198 +71,6 @@ static const sph_u32 H256[8] = {
|
||||
* of the compression function.
|
||||
*/
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
// SHA-256 compression of one 64-byte block using the x86 SHA extensions.
//
// input: the 64-byte message block. It is read with _mm_load_si128, so it
//        must be 16-byte aligned. Each 32-bit word is byte-swapped with
//        MASK before use.
// state: the eight 32-bit chaining words. They are read at entry and
//        overwritten with the updated values at exit, using aligned
//        128-bit loads/stores on &state[0] and &state[4].
//
// The lane-order comments (CDAB, ABEF, ...) list the 32-bit lanes from the
// most significant lane to the least significant lane of the register.
static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP, MASK;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state[4]);
|
||||
// Shuffle control that reverses the four bytes of every 32-bit word.
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
// Rearrange the state into the ABEF / CDGH layout used by the
// _mm_sha256rnds2_epu32 calls below.
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
MSG = _mm_load_si128((const __m128i*) (input+0));
|
||||
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
// Each _mm_set_epi64x literal packs the four round constants for the group.
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
// Move the upper two words of MSG into the low half for the next two rounds.
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Add values back to state
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
// Undo the ABEF / CDGH layout so the words can be stored in A..H order.
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state[4], STATE1);
|
||||
}
|
||||
|
||||
#else // no SHA
|
||||
|
||||
/*
|
||||
static const sph_u32 K[64] = {
|
||||
@@ -875,8 +683,24 @@ sha2_round(const unsigned char *data, sph_u32 r[8])
|
||||
#undef SHA2_IN
|
||||
}
|
||||
|
||||
#endif // SHA else
|
||||
// Apply SHA2_ROUND_BODY to one block of message words, reading the words
// from data[] exactly as stored (no byte swapping).
//
// state_out: receives a copy of state_in (32 bytes) and is then handed to
//            SHA2_ROUND_BODY as the state to update.
// data:      message words, indexed by SHA2_IN. The number of words read is
//            determined by SHA2_ROUND_BODY, which is defined outside this
//            function -- presumably 16; confirm against the macro.
// state_in:  initial state, 32 bytes. Copied with memcpy, so it must not
//            overlap state_out.
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
memcpy( state_out, state_in, 32 );
|
||||
// SHA2_IN is the accessor SHA2_ROUND_BODY uses to fetch message word x.
#define SHA2_IN(x) (data[x])
|
||||
SHA2_ROUND_BODY( SHA2_IN, state_out );
|
||||
#undef SHA2_IN
|
||||
}
|
||||
|
||||
// Apply SHA2_ROUND_BODY to one block of message words, decoding each word
// with sph_dec32be_aligned() before use.
//
// state_out: receives a copy of state_in (32 bytes) and is then handed to
//            SHA2_ROUND_BODY as the state to update.
// data:      message words, fetched through SHA2_IN. sph_dec32be_aligned is
//            declared elsewhere -- by its name it reads an aligned
//            big-endian 32-bit value; confirm against its definition.
// state_in:  initial state, 32 bytes. Copied with memcpy, so it must not
//            overlap state_out.
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
memcpy( state_out, state_in, 32 );
|
||||
// SHA2_IN is the accessor SHA2_ROUND_BODY uses to fetch message word x.
#define SHA2_IN(x) sph_dec32be_aligned( data+(x) )
|
||||
SHA2_ROUND_BODY( SHA2_IN, state_out );
|
||||
#undef SHA2_IN
|
||||
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
|
@@ -207,6 +207,13 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
|
||||
|
||||
void sph_sha256_full( void *dst, const void *data, size_t len );
|
||||
|
||||
// These shouldn't be called directly, use sha256-hash.h generic functions
|
||||
// sha256_transform_le & sha256_transform_be instead.
|
||||
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
|
||||
#if SPH_64
|
||||
|
@@ -20,8 +20,8 @@ static const uint32_t IV512[] =
|
||||
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror128_32( a ), \
|
||||
mm256_ror128_32( b ), 0x88 )
|
||||
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
|
||||
mm256_shuflr128_32( b ), 0x88 )
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
@@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( k00,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
|
||||
@@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( k01,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( k02,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( k03,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( k10,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
k12 = _mm256_xor_si256( k11,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( k12,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
|
||||
@@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k01 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k02 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k03 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p1 = _mm256_xor_si256( p1, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k10 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k11 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
k12 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k12 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k12, zero ) ), k11 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k13 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||
|
||||
@@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k01 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k02 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k03 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k10 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k11 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
|
||||
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
|
||||
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
|
||||
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k13 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||
|
||||
|
@@ -12,8 +12,8 @@ static const uint32_t IV512[] =
|
||||
};
|
||||
|
||||
#define mm512_ror2x512hi_1x32( a, b ) \
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
|
||||
mm512_ror128_32( b ) )
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
|
||||
mm512_shuflr128_32( b ) )
|
||||
|
||||
static void
|
||||
c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
@@ -60,7 +60,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
@@ -69,33 +69,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K0,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
|
||||
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
|
||||
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K1,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( K2,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
K4 = _mm512_xor_si512( K3,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( K4,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( K5,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( K6,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
|
||||
@@ -130,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K1 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K2 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K3 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K4 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K6 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K7 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
@@ -187,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K1 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K2 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K3 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K4 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
|
||||
K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
|
||||
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
|
||||
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7= _mm512_xor_si512( mm512_ror128_32(
|
||||
K7= _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
|
@@ -74,15 +74,15 @@ static const sph_u32 IV512[] = {
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if defined(__AVX2__)
|
||||
// 2 way version of above
|
||||
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
|
||||
mm256_rol256_3x32( b ), 0x88 )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
static void
|
||||
c512( sph_shavite_big_context *sc, const void *msg )
|
||||
@@ -135,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
if ( r == 0 )
|
||||
@@ -144,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
if ( r == 1 )
|
||||
@@ -153,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
if ( r == 2 )
|
||||
@@ -222,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
x = _mm_xor_si128( p2, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
x = _mm_xor_si128( p0, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
@@ -295,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
|
||||
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include "skein-hash-4way.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#if defined (SKEIN_8WAY)
|
||||
|
||||
@@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input )
|
||||
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
sph_sha256_context ctx_sha256;
|
||||
#else
|
||||
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
|
||||
sha256_4way_context ctx_sha256;
|
||||
@@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input )
|
||||
#if defined(__SHA__)
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash0 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash1 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash2 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash3 );
|
||||
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
|
||||
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
#else
|
||||
|
@@ -5,21 +5,18 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_skein.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
void skeinhash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_sha256_context ctx_sha256;
|
||||
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, input, 80 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
@@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input)
|
||||
int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t hash64[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -36,7 +33,7 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
|
@@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p )
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline uint32_t rotl32( uint32_t a, size_t r )
|
||||
{
|
||||
return ( a << r ) | ( a >> (32-r) );
|
||||
}
|
||||
|
||||
// Vectorized and targetted version of fnv1a
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
{ \
|
||||
const uint32_t *blob_off = blob + \
|
||||
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
|
||||
( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \
|
||||
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
|
||||
UPDATE_ACCUMULATOR; \
|
||||
MULXOR; \
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "Verthash.h"
|
||||
#include "tiny_sha3/sha3-4way.h"
|
||||
|
||||
@@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate )
|
||||
uint8_t vhDataFileHash[32] = { 0 };
|
||||
|
||||
applog( LOG_NOTICE, "Verifying Verthash data" );
|
||||
sph_sha256_full( vhDataFileHash, verthashInfo.data,
|
||||
sha256_full( vhDataFileHash, verthashInfo.data,
|
||||
verthashInfo.dataSize );
|
||||
if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes,
|
||||
sizeof(verthashDatFileHash_bytes) ) == 0 )
|
||||
|
@@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n );
|
||||
whirlpool_hash(vhash, endiandata);
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! opt_benchmark )
|
||||
submit_solution( work, vhash, mythr );
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
|
@@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata )
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
|
||||
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
|
||||
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
@@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)in0 + 64, 16 );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
|
||||
(const byte*)in1 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
|
||||
(const byte*)in2 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
|
||||
(const byte*)in3 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4,
|
||||
(const byte*)in4 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5,
|
||||
(const byte*)in5 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6,
|
||||
(const byte*)in6 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7,
|
||||
(const byte*)in7 + 64, 16 );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
|
||||
(const byte*)in0, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash4,
|
||||
(const byte*)in4, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash5,
|
||||
(const byte*)in5, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash6,
|
||||
(const byte*)in6, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash7,
|
||||
(const byte*)in7, size );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
@@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_2x128( vdata2, edata, edata, 640 );
|
||||
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
|
||||
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
@@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
intrlv_2x128( vhash, hash0, hash1, 640 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, hash2, hash3, 640 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
intrlv_2x128( vhash, hash0, hash1, 640 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, hash2, hash3, 640 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)in0 + 64, 16 );
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
cube_2way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3 + 64, 16 );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
cube_2way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
|
||||
(const byte*)in0, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3, size );
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
|
@@ -1,4 +1,5 @@
|
||||
#include "x16r-gate.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
|
||||
|
||||
|
@@ -37,6 +37,7 @@
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
@@ -115,7 +116,7 @@ union _x16r_8way_context_overlay
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
cube_4way_context cube;
|
||||
simd_4way_context simd;
|
||||
hamsi512_8way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
@@ -164,8 +165,8 @@ union _x16r_4way_context_overlay
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
hashState_luffa luffa1;
|
||||
cubehashParam cube;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
|
@@ -13,7 +13,7 @@
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
#if defined (X21S_8WAY)
|
||||
@@ -208,9 +208,7 @@ union _x21s_4way_context_overlay
|
||||
haval256_5_4way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(__SHA__)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(__SHA__)
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
@@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sha256_full( output, hash0, 64 );
|
||||
sha256_full( output+32, hash1, 64 );
|
||||
sha256_full( output+64, hash2, 64 );
|
||||
sha256_full( output+96, hash3, 64 );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
@@ -23,7 +23,7 @@ union _x21s_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
};
|
||||
typedef union _x21s_context_overlay x21s_context_overlay;
|
||||
|
||||
@@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid )
|
||||
sph_gost512 ( &ctx.gost, (const void*) hash, 64 );
|
||||
sph_gost512_close( &ctx.gost, (void*) hash );
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy( output, hash, 32 );
|
||||
|
||||
|
@@ -37,7 +37,8 @@ union _x17_8way_context_overlay
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
// cube_4way_context cube;
|
||||
cube_4way_2buf_context cube;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
@@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
|
||||
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
||||
|
||||
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
|
@@ -28,7 +28,7 @@
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
#if defined(X22I_8WAY)
|
||||
@@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay
|
||||
haval256_5_8way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X22I_8WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(X22I_8WAY_SHA)
|
||||
sha256_8way_context sha256;
|
||||
#endif
|
||||
#if defined(__VAES__)
|
||||
@@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X22I_8WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash4, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+128 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash5, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+160 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash6, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+192 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash7, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+224 );
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
sha256_full( hash4, hash4, 64 );
|
||||
sha256_full( hash5, hash5, 64 );
|
||||
sha256_full( hash6, hash6, 64 );
|
||||
sha256_full( hash7, hash7, 64 );
|
||||
|
||||
#else
|
||||
|
||||
@@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay
|
||||
haval256_5_4way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X22I_4WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(X22I_4WAY_SHA)
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
};
|
||||
@@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X22I_4WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -24,6 +24,7 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
@@ -57,7 +58,6 @@ union _x22i_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
};
|
||||
typedef union _x22i_context_overlay x22i_context_overlay;
|
||||
|
||||
@@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
sph_gost512 (&ctx.gost, (const void*) hash, 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) hash);
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
|
||||
|
@@ -33,7 +33,7 @@
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
void x25x_shuffle( void *hash )
|
||||
@@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X25X_8WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
#else
|
||||
sha256_8way_context sha256;
|
||||
#endif
|
||||
@@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X25X_8WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash0[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash1[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash2[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash3[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash4[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash4[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash5[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash5[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash6[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash6[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash7[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash7[21] );
|
||||
|
||||
sha256_full( hash0[21], hash0[20], 64 );
|
||||
sha256_full( hash1[21], hash1[20], 64 );
|
||||
sha256_full( hash2[21], hash2[20], 64 );
|
||||
sha256_full( hash3[21], hash3[20], 64 );
|
||||
sha256_full( hash4[21], hash4[20], 64 );
|
||||
sha256_full( hash5[21], hash5[20], 64 );
|
||||
sha256_full( hash6[21], hash6[20], 64 );
|
||||
sha256_full( hash7[21], hash7[20], 64 );
|
||||
|
||||
intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21],
|
||||
hash4[21], hash5[21], hash6[21], hash7[21] );
|
||||
|
||||
@@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X25X_4WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
#else
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
@@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X25X_4WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash0[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash1[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash2[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash3[21] );
|
||||
sha256_full( hash0[21], hash0[20], 64 );
|
||||
sha256_full( hash1[21], hash1[20], 64 );
|
||||
sha256_full( hash2[21], hash2[20], 64 );
|
||||
sha256_full( hash3[21], hash3[20], 64 );
|
||||
|
||||
intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] );
|
||||
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
@@ -60,7 +60,7 @@ union _x25x_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
sph_panama_context panama;
|
||||
blake2s_state blake2s;
|
||||
};
|
||||
@@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
sph_gost512 (&ctx.gost, (const void*) &hash[19], 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) &hash[20]);
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, &hash[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, &hash[21] );
|
||||
sha256_full( &hash[21], &hash[20], 64 );
|
||||
|
||||
sph_panama_init(&ctx.panama);
|
||||
sph_panama (&ctx.panama, (const void*) &hash[21], 64 );
|
||||
|
@@ -35,9 +35,11 @@
|
||||
#include "blake2b-yp.h"
|
||||
|
||||
// Cyclic right rotation.
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
#endif
|
||||
//#ifndef ROTR64
|
||||
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
//#endif
|
||||
|
||||
#define ROTR64(x, y) ror64( x, y )
|
||||
|
||||
// Little-endian byte access.
|
||||
#define B2B_GET64(p) \
|
||||
|
@@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
sha256_ctx_init( &sha256_prehash_ctx );
|
||||
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
yespower_tls( (unsigned char *)endiandata, params.perslen,
|
||||
|
@@ -27,14 +27,11 @@
|
||||
* coin.
|
||||
*/
|
||||
#include "yespower.h"
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
yespower_params_t yespower_params;
|
||||
|
||||
//SHA256_CTX sha256_prehash_ctx;
|
||||
__thread sph_sha256_context sha256_prehash_ctx;
|
||||
//__thread SHA256_CTX sha256_prehash_ctx;
|
||||
__thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
// YESPOWER
|
||||
|
||||
@@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
sha256_ctx_init( &sha256_prehash_ctx );
|
||||
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
|
||||
@@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce,
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) )
|
||||
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
|
||||
|
@@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
||||
ARX(X0, X3, X2, 18) \
|
||||
/* Rearrange data */ \
|
||||
X1 = _mm_shuffle_epi32(X1, 0x93); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x39); \
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x39); \
|
||||
/* Operate on "rows" */ \
|
||||
ARX(X3, X0, X1, 7) \
|
||||
ARX(X2, X3, X0, 9) \
|
||||
ARX(X1, X2, X3, 13) \
|
||||
ARX(X0, X1, X2, 18) \
|
||||
/* Rearrange data */ \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x93); \
|
||||
X1 = _mm_shuffle_epi32(X1, 0x39); \
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x93);
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E);
|
||||
|
||||
/**
|
||||
* Apply the Salsa20 core to the block provided in (X0 ... X3).
|
||||
@@ -1095,7 +1095,7 @@ int yespower(yespower_local_t *local,
|
||||
salsa20_blk_t *V, *XY;
|
||||
pwxform_ctx_t ctx;
|
||||
uint8_t sha256[32];
|
||||
sph_sha256_context sha256_ctx;
|
||||
sha256_context sha256_ctx;
|
||||
|
||||
/* Sanity-check parameters */
|
||||
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0)
|
||||
@@ -1138,10 +1138,9 @@ int yespower(yespower_local_t *local,
|
||||
|
||||
// copy prehash, do tail
|
||||
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
|
||||
|
||||
sph_sha256( &sha256_ctx, src+64, srclen-64 );
|
||||
sph_sha256_close( &sha256_ctx, sha256 );
|
||||
|
||||
sha256_update( &sha256_ctx, src+64, srclen-64 );
|
||||
sha256_final( &sha256_ctx, sha256 );
|
||||
|
||||
if ( version == YESPOWER_0_5 )
|
||||
{
|
||||
PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size );
|
||||
@@ -1186,7 +1185,9 @@ int yespower(yespower_local_t *local,
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
smix_1_0( B, r, N, V, XY, &ctx );
|
||||
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256),
|
||||
(uint8_t *)dst );
|
||||
}
|
||||
|
@@ -34,7 +34,7 @@
|
||||
#include <stdlib.h> /* for size_t */
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -78,9 +78,7 @@ typedef struct {
|
||||
|
||||
extern yespower_params_t yespower_params;
|
||||
|
||||
//SHA256_CTX sha256_prehash_ctx;
|
||||
extern __thread sph_sha256_context sha256_prehash_ctx;
|
||||
//extern __thread SHA256_CTX sha256_prehash_ctx;
|
||||
extern __thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
/**
|
||||
* yespower_init_local(local):
|
||||
|
@@ -4,7 +4,7 @@
|
||||
# during development. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
|
||||
# Icelake AVX512 SHA VAES
|
||||
make distclean || echo clean
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.17.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.17.1'
|
||||
PACKAGE_VERSION='3.18.0'
|
||||
PACKAGE_STRING='cpuminer-opt 3.18.0'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.17.1
|
||||
cpuminer-opt configure 3.18.0
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.17.1, which was
|
||||
It was created by cpuminer-opt $as_me 3.18.0, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.17.1'
|
||||
VERSION='3.18.0'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.17.1, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.18.0, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.17.1
|
||||
cpuminer-opt config.status 3.18.0
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.17.1])
|
||||
AC_INIT([cpuminer-opt], [3.18.0])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
136
cpu-miner.c
136
cpu-miner.c
@@ -38,6 +38,7 @@
|
||||
#include <jansson.h>
|
||||
#include <openssl/sha.h>
|
||||
#include "sysinfos.c"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
#ifdef WIN32
|
||||
#include <winsock2.h>
|
||||
@@ -94,6 +95,7 @@ bool have_gbt = true;
|
||||
bool allow_getwork = true;
|
||||
bool want_stratum = true; // pretty useless
|
||||
bool have_stratum = false;
|
||||
bool stratum_down = true;
|
||||
bool allow_mininginfo = true;
|
||||
bool use_syslog = false;
|
||||
bool use_colors = true;
|
||||
@@ -166,6 +168,8 @@ uint32_t stale_share_count = 0;
|
||||
uint32_t solved_block_count = 0;
|
||||
double *thr_hashrates;
|
||||
double global_hashrate = 0.;
|
||||
double total_hashes = 0.;
|
||||
struct timeval total_hashes_time = {0,0};
|
||||
double stratum_diff = 0.;
|
||||
double net_diff = 0.;
|
||||
double net_hashrate = 0.;
|
||||
@@ -1001,6 +1005,7 @@ struct share_stats_t
|
||||
double share_diff;
|
||||
double stratum_diff;
|
||||
double target_diff;
|
||||
uint32_t height;
|
||||
char job_id[32];
|
||||
};
|
||||
|
||||
@@ -1080,13 +1085,14 @@ void report_summary_log( bool force )
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
timeval_subtract( &et, &now, &start_time );
|
||||
timeval_subtract( &uptime, &now, &session_start );
|
||||
timeval_subtract( &uptime, &total_hashes_time, &session_start );
|
||||
|
||||
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
|
||||
double ghrate = global_hashrate;
|
||||
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
|
||||
double target_diff = exp32 * last_targetdiff;
|
||||
double shrate = safe_div( target_diff * (double)(accepts),
|
||||
share_time, 0. );
|
||||
// global_hashrate = ghrate;
|
||||
double sess_hrate = safe_div( exp32 * norm_diff_sum,
|
||||
(double)uptime.tv_sec, 0. );
|
||||
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
|
||||
@@ -1134,29 +1140,38 @@ void report_summary_log( bool force )
|
||||
100. * safe_div( (double)accepted_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
if ( stale_share_count )
|
||||
applog2( LOG_INFO, "Stale %7d %7d %5.1f%%",
|
||||
{
|
||||
int prio = stales ? LOG_MINR : LOG_INFO;
|
||||
applog2( prio, "Stale %7d %7d %5.1f%%",
|
||||
stales, stale_share_count,
|
||||
100. * safe_div( (double)stale_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
}
|
||||
if ( rejected_share_count )
|
||||
applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%",
|
||||
{
|
||||
int prio = rejects ? LOG_ERR : LOG_INFO;
|
||||
applog2( prio, "Rejected %7d %7d %5.1f%%",
|
||||
rejects, rejected_share_count,
|
||||
100. * safe_div( (double)rejected_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
}
|
||||
if ( solved_block_count )
|
||||
applog2( LOG_INFO,"Blocks Solved %7d %7d",
|
||||
{
|
||||
int prio = solved ? LOG_PINK : LOG_INFO;
|
||||
applog2( prio, "Blocks Solved %7d %7d",
|
||||
solved, solved_block_count );
|
||||
}
|
||||
applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g",
|
||||
highest_share, lowest_share );
|
||||
highest_share, lowest_share );
|
||||
|
||||
int mismatch = submitted_share_count
|
||||
- ( accepted_share_count + stale_share_count + rejected_share_count );
|
||||
if ( mismatch )
|
||||
{
|
||||
if ( mismatch != 1 )
|
||||
applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
|
||||
applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch );
|
||||
else
|
||||
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
|
||||
applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1278,17 +1293,17 @@ static int share_result( int result, struct work *work,
|
||||
|
||||
if ( use_colors )
|
||||
{
|
||||
bcol = acol = scol = rcol = CL_WHT;
|
||||
bcol = acol = scol = rcol = CL_N;
|
||||
if ( likely( result ) )
|
||||
{
|
||||
acol = CL_WHT CL_GRN;
|
||||
if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG;
|
||||
acol = CL_LGR;
|
||||
if ( unlikely( solved ) ) bcol = CL_LMA;
|
||||
}
|
||||
else if ( stale ) scol = CL_WHT CL_YL2;
|
||||
else rcol = CL_WHT CL_RED;
|
||||
else if ( stale ) scol = CL_YL2;
|
||||
else rcol = CL_LRD;
|
||||
}
|
||||
|
||||
applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
|
||||
applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
|
||||
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
|
||||
bres, share_time, latency );
|
||||
|
||||
@@ -1296,8 +1311,7 @@ static int share_result( int result, struct work *work,
|
||||
{
|
||||
if ( have_stratum )
|
||||
applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s",
|
||||
my_stats.share_diff, stratum.block_height,
|
||||
my_stats.job_id );
|
||||
my_stats.share_diff, my_stats.height, my_stats.job_id );
|
||||
else
|
||||
applog2( LOG_INFO, "Diff %.5g, Block %d",
|
||||
my_stats.share_diff, work ? work->height : last_block_height );
|
||||
@@ -1308,7 +1322,7 @@ static int share_result( int result, struct work *work,
|
||||
uint32_t str[8];
|
||||
uint32_t *targ;
|
||||
|
||||
if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason );
|
||||
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
|
||||
|
||||
diff_to_hash( str, my_stats.share_diff );
|
||||
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
|
||||
@@ -1861,6 +1875,7 @@ static void update_submit_stats( struct work *work, const void *hash )
|
||||
share_stats[ s_put_ptr ].net_diff = net_diff;
|
||||
share_stats[ s_put_ptr ].stratum_diff = stratum_diff;
|
||||
share_stats[ s_put_ptr ].target_diff = work->targetdiff;
|
||||
share_stats[ s_put_ptr ].height = work->height;
|
||||
if ( have_stratum )
|
||||
strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 );
|
||||
s_put_ptr = stats_ptr_incr( s_put_ptr );
|
||||
@@ -1871,6 +1886,10 @@ static void update_submit_stats( struct work *work, const void *hash )
|
||||
bool submit_solution( struct work *work, const void *hash,
|
||||
struct thr_info *thr )
|
||||
{
|
||||
// Job went stale during hashing of a valid share.
|
||||
if ( !opt_quiet && work_restart[ thr->id ].restart )
|
||||
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
|
||||
|
||||
work->sharediff = hash_to_diff( hash );
|
||||
if ( likely( submit_work( thr, work ) ) )
|
||||
{
|
||||
@@ -1887,11 +1906,11 @@ bool submit_solution( struct work *work, const void *hash,
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
if ( have_stratum )
|
||||
applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s",
|
||||
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
|
||||
submitted_share_count, work->sharediff, work->height,
|
||||
work->job_id );
|
||||
else
|
||||
applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
|
||||
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
|
||||
submitted_share_count, work->sharediff, work->height,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
}
|
||||
@@ -2048,7 +2067,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
pthread_rwlock_wrlock( &g_work_lock );
|
||||
pthread_mutex_lock( &sctx->work_lock );
|
||||
|
||||
new_job = sctx->new_job;
|
||||
new_job = sctx->new_job; // otherwise just increment extranonce2
|
||||
sctx->new_job = false;
|
||||
|
||||
free( g_work->job_id );
|
||||
@@ -2084,6 +2103,14 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
int mismatch = submitted_share_count
|
||||
- ( accepted_share_count + stale_share_count + rejected_share_count );
|
||||
if ( mismatch )
|
||||
applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count );
|
||||
}
|
||||
|
||||
if ( stratum_diff != sctx->job.diff )
|
||||
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
|
||||
sctx->job.diff, sctx->block_height, g_work->job_id );
|
||||
@@ -2264,19 +2291,29 @@ static void *miner_thread( void *userdata )
|
||||
}
|
||||
|
||||
// wait for stratum to send first job
|
||||
if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1);
|
||||
if ( have_stratum ) while ( unlikely( stratum_down ) )
|
||||
{
|
||||
if ( opt_debug )
|
||||
applog( LOG_INFO, "Thread %d waiting for first job", thr_id );
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
// nominal starting values
|
||||
int64_t max64 = 20;
|
||||
thr_hashrates[thr_id] = 20;
|
||||
while (1)
|
||||
{
|
||||
uint64_t hashes_done;
|
||||
struct timeval tv_start, tv_end, diff;
|
||||
int64_t max64 = 1000;
|
||||
// int64_t max64 = 1000;
|
||||
int nonce_found = 0;
|
||||
|
||||
if ( likely( algo_gate.do_this_thread( thr_id ) ) )
|
||||
{
|
||||
if ( have_stratum )
|
||||
if ( have_stratum )
|
||||
{
|
||||
while ( unlikely( stratum_down ) )
|
||||
sleep( 1 );
|
||||
if ( *nonceptr >= end_nonce )
|
||||
stratum_gen_work( &stratum, &g_work );
|
||||
}
|
||||
@@ -2383,6 +2420,8 @@ static void *miner_thread( void *userdata )
|
||||
if ( diff.tv_usec || diff.tv_sec )
|
||||
{
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
total_hashes += hashes_done;
|
||||
total_hashes_time = tv_end;
|
||||
thr_hashrates[thr_id] =
|
||||
hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 );
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
@@ -2439,7 +2478,6 @@ static void *miner_thread( void *userdata )
|
||||
&& thr_id == opt_n_threads - 1 ) )
|
||||
{
|
||||
double hashrate = 0.;
|
||||
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
for ( i = 0; i < opt_n_threads; i++ )
|
||||
hashrate += thr_hashrates[i];
|
||||
@@ -2448,8 +2486,12 @@ static void *miner_thread( void *userdata )
|
||||
|
||||
if ( opt_benchmark )
|
||||
{
|
||||
struct timeval uptime;
|
||||
char hr[16];
|
||||
char hr_units[2] = {0,0};
|
||||
timeval_subtract( &uptime, &total_hashes_time, &session_start );
|
||||
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
|
||||
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
|
||||
@@ -2745,6 +2787,7 @@ static void *stratum_thread(void *userdata )
|
||||
if ( unlikely( stratum_need_reset ) )
|
||||
{
|
||||
stratum_need_reset = false;
|
||||
stratum_down = true;
|
||||
stratum_disconnect( &stratum );
|
||||
if ( strcmp( stratum.url, rpc_url ) )
|
||||
{
|
||||
@@ -2755,11 +2798,13 @@ static void *stratum_thread(void *userdata )
|
||||
else
|
||||
applog(LOG_WARNING, "Stratum connection reset");
|
||||
// reset stats queue as well
|
||||
restart_threads();
|
||||
if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
|
||||
}
|
||||
|
||||
while ( !stratum.curl )
|
||||
{
|
||||
stratum_down = true;
|
||||
pthread_rwlock_wrlock( &g_work_lock );
|
||||
g_work_time = 0;
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
@@ -2780,6 +2825,7 @@ static void *stratum_thread(void *userdata )
|
||||
}
|
||||
else
|
||||
{
|
||||
stratum_down = false;
|
||||
restart_threads();
|
||||
applog(LOG_BLUE,"Stratum connection established" );
|
||||
}
|
||||
@@ -2801,7 +2847,7 @@ static void *stratum_thread(void *userdata )
|
||||
}
|
||||
else
|
||||
{
|
||||
applog(LOG_WARNING, "Stratum connection interrupted");
|
||||
// applog(LOG_WARNING, "Stratum connection interrupted");
|
||||
// stratum_disconnect( &stratum );
|
||||
stratum_need_reset = true;
|
||||
}
|
||||
@@ -3629,6 +3675,10 @@ int main(int argc, char *argv[])
|
||||
show_usage_and_exit(1);
|
||||
}
|
||||
|
||||
// need to register to get algo optimizations for cpu capabilities
|
||||
// but that causes registration logs to appear before the cpu capabilities are output.
|
||||
// Would need to split register into 2 parts. First part sets algo
|
||||
// optimizations but no logging, second part does any logging.
|
||||
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
|
||||
|
||||
if ( !check_cpu_capability() ) exit(1);
|
||||
@@ -3685,12 +3735,6 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize stats times and counters
|
||||
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
|
||||
|
||||
// if ( !check_cpu_capability() ) exit(1);
|
||||
|
||||
pthread_mutex_init( &stats_lock, NULL );
|
||||
@@ -3854,7 +3898,8 @@ int main(int argc, char *argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if ( have_stratum )
|
||||
|
||||
if ( have_stratum )
|
||||
{
|
||||
if ( opt_debug )
|
||||
applog(LOG_INFO,"Creating stratum thread");
|
||||
@@ -3900,24 +3945,35 @@ int main(int argc, char *argv[])
|
||||
opt_api_listen );
|
||||
}
|
||||
|
||||
// hold the stats lock while starting miner threads
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
/* start mining threads */
|
||||
for (i = 0; i < opt_n_threads; i++)
|
||||
for ( i = 0; i < opt_n_threads; i++ )
|
||||
{
|
||||
usleep( 5000 );
|
||||
// usleep( 5000 );
|
||||
thr = &thr_info[i];
|
||||
thr->id = i;
|
||||
thr->q = tq_new();
|
||||
if (!thr->q)
|
||||
if ( !thr->q )
|
||||
return 1;
|
||||
err = thread_create(thr, miner_thread);
|
||||
if (err) {
|
||||
applog(LOG_ERR, "Miner thread %d create failed", i);
|
||||
err = thread_create( thr, miner_thread );
|
||||
if ( err )
|
||||
{
|
||||
applog( LOG_ERR, "Miner thread %d create failed", i );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
|
||||
opt_n_threads, num_cpus, algo_names[opt_algo] );
|
||||
// Initialize stats times and counters
|
||||
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
|
||||
opt_n_threads, num_cpus, algo_names[opt_algo] );
|
||||
|
||||
/* main loop - simply wait for workio thread to exit */
|
||||
pthread_join( thr_info[work_thr_id].pth, NULL );
|
||||
|
37
miner.h
37
miner.h
@@ -70,17 +70,25 @@ void *alloca (size_t);
|
||||
|
||||
#ifdef HAVE_SYSLOG_H
|
||||
#include <syslog.h>
|
||||
#define LOG_BLUE 0x10 /* unique value */
|
||||
#define LOG_BLUE 0x10 /* unique value */
|
||||
#define LOG_MAJR 0x11 /* unique value */
|
||||
#define LOG_MINR 0x12 /* unique value */
|
||||
#define LOG_GREEN 0x13 /* unique value */
|
||||
#define LOG_PINK 0x14 /* unique value */
|
||||
#else
|
||||
enum {
|
||||
LOG_ERR,
|
||||
LOG_CRIT,
|
||||
LOG_ERR,
|
||||
LOG_WARNING,
|
||||
LOG_NOTICE,
|
||||
LOG_INFO,
|
||||
LOG_DEBUG,
|
||||
/* custom notices */
|
||||
LOG_BLUE = 0x10,
|
||||
};
|
||||
/* custom notices */
|
||||
LOG_BLUE = 0x10,
|
||||
LOG_MAJR = 0x11,
|
||||
LOG_MINR = 0x12,
|
||||
LOG_GREEN = 0x13,
|
||||
LOG_PINK = 0x14 };
|
||||
#endif
|
||||
|
||||
extern bool is_power_of_2( int n );
|
||||
@@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err);
|
||||
|
||||
void sha256_init(uint32_t *state);
|
||||
void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len);
|
||||
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
|
||||
|
||||
#ifdef USE_ASM
|
||||
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
|
||||
@@ -225,7 +233,8 @@ int sha256_use_4way();
|
||||
void sha256_init_4way(uint32_t *state);
|
||||
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
|
||||
#endif
|
||||
#if defined(__x86_64__) && defined(USE_AVX2)
|
||||
//#if defined(__x86_64__) && defined(USE_AVX2)
|
||||
#if defined(__x86_64__) && defined(__AVX2__)
|
||||
#define HAVE_SHA256_8WAY 1
|
||||
int sha256_use_8way();
|
||||
void sha256_init_8way(uint32_t *state);
|
||||
@@ -271,9 +280,9 @@ struct thr_api {
|
||||
#define CL_N "\x1B[0m"
|
||||
#define CL_RED "\x1B[31m"
|
||||
#define CL_GRN "\x1B[32m"
|
||||
#define CL_YLW "\x1B[33m"
|
||||
#define CL_YLW "\x1B[33m" // dark yellow
|
||||
#define CL_BLU "\x1B[34m"
|
||||
#define CL_MAG "\x1B[35m"
|
||||
#define CL_MAG "\x1B[35m" // purple
|
||||
#define CL_CYN "\x1B[36m"
|
||||
|
||||
#define CL_BLK "\x1B[22;30m" /* black */
|
||||
@@ -281,7 +290,7 @@ struct thr_api {
|
||||
#define CL_GR2 "\x1B[22;32m" /* green */
|
||||
#define CL_BRW "\x1B[22;33m" /* brown */
|
||||
#define CL_BL2 "\x1B[22;34m" /* blue */
|
||||
#define CL_MA2 "\x1B[22;35m" /* magenta */
|
||||
#define CL_MA2 "\x1B[22;35m" /* purple */
|
||||
#define CL_CY2 "\x1B[22;36m" /* cyan */
|
||||
#define CL_SIL "\x1B[22;37m" /* gray */
|
||||
|
||||
@@ -290,9 +299,9 @@ struct thr_api {
|
||||
#else
|
||||
#define CL_GRY "\x1B[90m" /* dark gray selectable in putty */
|
||||
#endif
|
||||
#define CL_LRD "\x1B[01;31m" /* light red */
|
||||
#define CL_LGR "\x1B[01;32m" /* light green */
|
||||
#define CL_YL2 "\x1B[01;33m" /* yellow */
|
||||
#define CL_LRD "\x1B[01;31m" /* bright red */
|
||||
#define CL_LGR "\x1B[01;32m" /* bright green */
|
||||
#define CL_YL2 "\x1B[01;33m" /* bright yellow */
|
||||
#define CL_LBL "\x1B[01;34m" /* light blue */
|
||||
#define CL_LMA "\x1B[01;35m" /* light magenta */
|
||||
#define CL_LCY "\x1B[01;36m" /* light cyan */
|
||||
@@ -481,7 +490,7 @@ void format_hashrate(double hashrate, char *output);
|
||||
void print_hash_tests(void);
|
||||
|
||||
void scale_hash_for_display ( double* hashrate, char* units );
|
||||
|
||||
void format_number_si( double* hashrate, char* si_units );
|
||||
void report_summary_log( bool force );
|
||||
|
||||
/*
|
||||
|
@@ -78,6 +78,8 @@
|
||||
// - specialized shift and rotate functions that move elements around
|
||||
// use the notation "1x32" to indicate the distance moved as units of
|
||||
// the element size.
|
||||
// Vector shuffle rotations are being renamed to "vrol" and "vror"
|
||||
// to avoid confusion with bit rotations.
|
||||
// - there is a subset of some functions for scalar data. They may have
|
||||
// no prefix nor vec-size, just one size, the size of the data.
|
||||
// - Some integer functions are also defined which use a similar notation.
|
||||
|
@@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1,
|
||||
d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51];
|
||||
d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55];
|
||||
d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59];
|
||||
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63];
|
||||
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63];
|
||||
}
|
||||
|
||||
static inline void extr_lane_2x32( void *dst, const void *src,
|
||||
|
@@ -35,6 +35,13 @@
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
// Used instead of casting.
|
||||
typedef union
|
||||
{
|
||||
__m128i m128;
|
||||
uint32_t u32[4];
|
||||
} __attribute__ ((aligned (16))) m128_ovly;
|
||||
|
||||
// Efficient and convenient moving between GP & low bits of XMM.
|
||||
// Use VEX when available to give access to xmm8-15 and zero extend for
|
||||
// larger vectors.
|
||||
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline uint64_t mm128_mov128_64( const __m128i a )
|
||||
// Inconsistent naming, prefix should reflect return value:
|
||||
// u64_mov128_64
|
||||
|
||||
static inline uint64_t u64_mov128_64( const __m128i a )
|
||||
{
|
||||
uint64_t n;
|
||||
#if defined(__AVX__)
|
||||
@@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a )
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline uint32_t mm128_mov128_32( const __m128i a )
|
||||
static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
{
|
||||
uint32_t n;
|
||||
#if defined(__AVX__)
|
||||
@@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
|
||||
|
||||
// Extract 32 bit element c from v and return as integer.
|
||||
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
|
||||
{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
|
||||
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
|
||||
|
||||
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
|
||||
static inline __m128i mm128_mask_32( const __m128i v, const int m )
|
||||
{ return mm128_xim_32( v, v, m ); }
|
||||
|
||||
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
|
||||
// it's faster to precalculate the index.
|
||||
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
|
||||
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
|
||||
|
||||
#endif // SSE4_1
|
||||
|
||||
//
|
||||
@@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// Diagonal blend: d = { v3[3], v2[2], v1[1], v0[0] }
|
||||
|
||||
// Blend 4 32 bit elements from 4 vectors
|
||||
|
||||
#if defined (__AVX2__)

// Diagonal blend of four vectors of 32 bit elements: the result takes
// element 3 from v3, element 2 from v2, element 1 from v1 and element 0
// from v0. Each mask bit selects one 32 bit element.
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
   _mm_blend_epi32( _mm_blend_epi32( v3, v2, 0x4 ), \
                    _mm_blend_epi32( v1, v0, 0x1 ), 0x3 )

#elif defined(__SSE4_1__)

// Same result using 16 bit blends, every 32 bit element is covered by
// two adjacent mask bits.
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
   _mm_blend_epi16( _mm_blend_epi16( v3, v2, 0x30 ), \
                    _mm_blend_epi16( v1, v0, 0x03 ), 0x0f )

#endif
|
||||
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
|
||||
// AVX512VL has implemented bit rotation for 128 bit vectors with
|
||||
// 64 and 32 bit elements.
|
||||
|
||||
// x2 rotates elements in 2 individual vectors in a double buffered
|
||||
// optimization for SSE2, does nothing for AVX512 but is there for
|
||||
// transparency.
|
||||
|
||||
// compiler doesn't like when a variable is used for the last arg of
|
||||
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
|
||||
// specification but works with a variable. Therefore use rol_var where
|
||||
@@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_ror_32 _mm_ror_epi32
|
||||
#define mm128_rol_32 _mm_rol_epi32
|
||||
|
||||
// x2 bit rotates: rotate the elements of two independent vectors in place
// by c bits. With AVX512VL each vector is rotated by a single instruction
// and the result is written back to the argument, which is what the SSE2
// versions of these macros do.

#define mm128_rorx2_64( v1, v0, c ) \
do { \
   v0 = _mm_ror_epi64( v0, c ); \
   v1 = _mm_ror_epi64( v1, c ); \
} while(0)

#define mm128_rolx2_64( v1, v0, c ) \
do { \
   v0 = _mm_rol_epi64( v0, c ); \
   v1 = _mm_rol_epi64( v1, c ); \
} while(0)

#define mm128_rorx2_32( v1, v0, c ) \
do { \
   v0 = _mm_ror_epi32( v0, c ); \
   v1 = _mm_ror_epi32( v1, c ); \
} while(0)

#define mm128_rolx2_32( v1, v0, c ) \
do { \
   v0 = _mm_rol_epi32( v0, c ); \
   v1 = _mm_rol_epi32( v1, c ); \
} while(0)
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_ror_64 mm128_ror_var_64
|
||||
@@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_ror_32 mm128_ror_var_32
|
||||
#define mm128_rol_32 mm128_rol_var_32
|
||||
|
||||
// SSE2 has no vector bit rotate, it is built from two opposite shifts
// combined with a 128 bit OR. Both vectors are rotated in place by c bits,
// the operations on v0 and v1 are interleaved (double buffered).

#define mm128_rorx2_64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi64( v0, c ); \
   __m128i t1 = _mm_srli_epi64( v1, c ); \
   v0 = _mm_slli_epi64( v0, 64-(c) ); \
   v1 = _mm_slli_epi64( v1, 64-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rolx2_64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi64( v0, c ); \
   __m128i t1 = _mm_slli_epi64( v1, c ); \
   v0 = _mm_srli_epi64( v0, 64-(c) ); \
   v1 = _mm_srli_epi64( v1, 64-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rorx2_32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi32( v0, c ); \
   __m128i t1 = _mm_srli_epi32( v1, c ); \
   v0 = _mm_slli_epi32( v0, 32-(c) ); \
   v1 = _mm_slli_epi32( v1, 32-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rolx2_32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi32( v0, c ); \
   __m128i t1 = _mm_slli_epi32( v1, c ); \
   v0 = _mm_srli_epi32( v0, 32-(c) ); \
   v1 = _mm_srli_epi32( v1, 32-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}
|
||||
|
||||
#endif // AVX512 else SSE2
|
||||
|
||||
#define mm128_ror_16( v, c ) \
|
||||
@@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
// Rotate vector elements across all lanes
|
||||
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
#define mm128_shuflr_64 mm128_swap_64
|
||||
#define mm128_shufll_64 mm128_swap_64
|
||||
|
||||
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
|
||||
|
||||
// Swap 32 bit elements in 64 bit lanes
|
||||
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||
#define mm128_shuflr64_32 mm128_swap64_32
|
||||
#define mm128_shufll64_32 mm128_swap64_32
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Rotate right by c bytes, no SSE2 equivalent.
|
||||
static inline __m128i mm128_ror_x8( const __m128i v, const int c )
|
||||
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
{ return _mm_alignr_epi8( v, v, c ); }
|
||||
|
||||
//
|
||||
@@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
v1 = _mm_xor_si128( v1, v2 );
|
||||
|
||||
|
||||
// Two input shuffle-rotate.
|
||||
// Concatenate v1 & v2 and rotate as one 256 bit vector.
|
||||
#if defined(__SSE4_1__)
|
||||
// Continue to use vror/vrol for now to avoid confusion with
|
||||
// shufl2r/shufl2l function macros available with AVX512.
|
||||
|
||||
#define mm128_ror256_64( v1, v2 ) \
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Function macro with two inputs and one output, inputs are preserved.
|
||||
// Returns modified first arg.
|
||||
// Two input functions are not available without SSSE3. Use procedure
|
||||
// below instead.
|
||||
|
||||
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
|
||||
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
|
||||
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
|
||||
|
||||
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
|
||||
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
|
||||
|
||||
// Two input shuffle by one 8 bit element, returns the modified first arg.
// NOTE(review): the byte count passed to _mm_alignr_epi8 is 8, identical to
// the _64 variants; the _32 and _16 variants use 4 and 2 so a one byte
// shuffle would presumably use 1 here - confirm before relying on these.
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
|
||||
// Procedure macros with 2 inputs and 2 outputs, inputs are destroyed.
|
||||
// Returns both modified args in place.
|
||||
|
||||
// These macros retain the vrol/vror name for now to avoid
|
||||
// confusion with the shufl2r/shufl2l function macros above.
|
||||
// These may be renamed to something like shufl2r2 for 2 inputs and
|
||||
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_64( v1, v2 ) \
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_32( v1, v2 ) \
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_32( v1, v2 ) \
|
||||
#define mm128_vrol256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_16( v1, v2 ) \
|
||||
#define mm128_vror256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_16( v1, v2 ) \
|
||||
#define mm128_vrol256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_8( v1, v2 ) \
|
||||
#define mm128_vror256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_8( v1, v2 ) \
|
||||
#define mm128_vrol256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
|
||||
@@ -483,7 +614,7 @@ do { \
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_ror256_64( v1, v2 ) \
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
||||
_mm_slli_si128( v2, 8 ) ); \
|
||||
@@ -492,7 +623,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_64( v1, v2 ) \
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||
_mm_srli_si128( v2, 8 ) ); \
|
||||
@@ -501,7 +632,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_32( v1, v2 ) \
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
||||
_mm_slli_si128( v2, 12 ) ); \
|
||||
@@ -510,7 +641,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_32( v1, v2 ) \
|
||||
#define mm128_vrol256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
||||
_mm_srli_si128( v2, 12 ) ); \
|
||||
@@ -519,7 +650,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_16( v1, v2 ) \
|
||||
#define mm128_vror256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
|
||||
_mm_slli_si128( v2, 14 ) ); \
|
||||
@@ -528,7 +659,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_16( v1, v2 ) \
|
||||
#define mm128_vrol256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
|
||||
_mm_srli_si128( v2, 14 ) ); \
|
||||
@@ -537,7 +668,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_ror256_8( v1, v2 ) \
|
||||
#define mm128_vror256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
|
||||
_mm_slli_si128( v2, 15 ) ); \
|
||||
@@ -546,7 +677,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_rol256_8( v1, v2 ) \
|
||||
#define mm128_vrol256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
|
||||
_mm_srli_si128( v2, 15 ) ); \
|
||||
|
@@ -14,13 +14,28 @@
|
||||
// is limited because 256 bit vectors are less likely to be used when 512
|
||||
// is available.
|
||||
|
||||
// Used instead of casting.
|
||||
typedef union
|
||||
{
|
||||
__m256i m256;
|
||||
__m128i m128[2];
|
||||
uint64_t u64[4];
|
||||
uint32_t u32[8];
|
||||
} __attribute__ ((aligned (32))) m256_ovly;
|
||||
|
||||
|
||||
// Move integer to low element of vector, other elements are set to zero.
|
||||
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
|
||||
#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )
|
||||
|
||||
// Move low element of vector to integer.
|
||||
#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
|
||||
#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
|
||||
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
|
||||
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
|
||||
|
||||
// deprecated
|
||||
//#define mm256_mov256_64 u64_mov256_64
|
||||
//#define mm256_mov256_32 u32_mov256_32
|
||||
|
||||
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
@@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
|
||||
#endif
|
||||
|
||||
// Diagonal blending
|
||||
|
||||
// Blend 4 64 bit elements from 4 vectors
|
||||
// Diagonal blend of four vectors of 64 bit elements: the result takes
// element 3 from v3, element 2 from v2, element 1 from v1 and element 0
// from v0. Built from 32 bit blends, so every 64 bit element is covered
// by two adjacent mask bits.
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
   _mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
                       _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
|
||||
|
||||
// Blend 8 32 bit elements from 8 vectors
|
||||
// Diagonal blend of eight vectors of 32 bit elements: element n of the
// result is taken from vn. The upper and lower halves are each assembled
// from two pair-wise blends and then merged by the final 0x0f blend.
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
   _mm256_blend_epi32( \
      _mm256_blend_epi32( \
         _mm256_blend_epi32( v7, v6, 0x40 ), \
         _mm256_blend_epi32( v5, v4, 0x10 ), 0x30 ), \
      _mm256_blend_epi32( \
         _mm256_blend_epi32( v3, v2, 0x04 ), \
         _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
|
||||
|
||||
|
||||
// Blend 4 32 bit elements from each 128 bit lane.
|
||||
// Diagonal blend applied to each 128 bit lane independently: in every lane
// element 3 comes from v3, element 2 from v2, element 1 from v1 and
// element 0 from v0. The final mask 0x33 takes the two low elements of
// each lane from the v1/v0 blend and the two high elements from v3/v2.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
   _mm256_blend_epi32( \
      _mm256_blend_epi32( v3, v2, 0x44 ), \
      _mm256_blend_epi32( v1, v0, 0x11 ), 0x33 )
|
||||
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
//
|
||||
// The only bit shift for more than 64 bits is with __int128.
|
||||
// The only bit shift for more than 64 bits is with __int128 which is slow.
|
||||
//
|
||||
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
|
||||
//
|
||||
// x2 rotates elements in 2 individual vectors in a double buffered
|
||||
// optimization for SSE2, does nothing for AVX512 but is there for
|
||||
// transparency.
|
||||
|
||||
|
||||
// compiler doesn't like when a variable is used for the last arg of
|
||||
@@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
#define mm256_ror_32 _mm256_ror_epi32
|
||||
#define mm256_rol_32 _mm256_rol_epi32
|
||||
|
||||
// x2 bit rotates: rotate the elements of two independent vectors in place
// by c bits. With AVX512VL each vector is rotated by a single instruction
// and the result is written back to the argument, which is what the AVX2
// versions of these macros do.

#define mm256_rorx2_64( v1, v0, c ) \
do { \
   v0 = _mm256_ror_epi64( v0, c ); \
   v1 = _mm256_ror_epi64( v1, c ); \
} while(0)

#define mm256_rolx2_64( v1, v0, c ) \
do { \
   v0 = _mm256_rol_epi64( v0, c ); \
   v1 = _mm256_rol_epi64( v1, c ); \
} while(0)

#define mm256_rorx2_32( v1, v0, c ) \
do { \
   v0 = _mm256_ror_epi32( v0, c ); \
   v1 = _mm256_ror_epi32( v1, c ); \
} while(0)

#define mm256_rolx2_32( v1, v0, c ) \
do { \
   v0 = _mm256_rol_epi32( v0, c ); \
   v1 = _mm256_rol_epi32( v1, c ); \
} while(0)
|
||||
|
||||
#else // AVX2
|
||||
|
||||
#define mm256_ror_64 mm256_ror_var_64
|
||||
@@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
#define mm256_ror_32 mm256_ror_var_32
|
||||
#define mm256_rol_32 mm256_rol_var_32
|
||||
|
||||
#define mm256_rorx2_64( v1, v0, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_srli_epi64( v0, c ); \
|
||||
__m256i t1 = _mm256_srli_epi64( v1, c ); \
|
||||
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
|
||||
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
|
||||
v0 = _mm256_or_si256( v0, t0 ); \
|
||||
v1 = _mm256_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm256_rolx2_64( v1, v0, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_slli_epi64( v0, c ); \
|
||||
__m256i t1 = _mm256_slli_epi64( v1, c ); \
|
||||
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
|
||||
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
|
||||
v0 = _mm256_or_si256( v0, t0 ); \
|
||||
v1 = _mm256_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm256_rorx2_32( v1, v0, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_srli_epi32( v0, c ); \
|
||||
__m256i t1 = _mm256_srli_epi32( v1, c ); \
|
||||
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
|
||||
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
|
||||
v0 = _mm256_or_si256( v0, t0 ); \
|
||||
v1 = _mm256_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm256_rolx2_32( v1, v0, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_slli_epi32( v0, c ); \
|
||||
__m256i t1 = _mm256_slli_epi32( v1, c ); \
|
||||
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
|
||||
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
|
||||
v0 = _mm256_or_si256( v0, t0 ); \
|
||||
v1 = _mm256_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#endif // AVX512 else AVX2
|
||||
|
||||
#define mm256_ror_16( v, c ) \
|
||||
@@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
//
|
||||
// Rotate elements across all lanes.
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
static inline __m256i mm256_swap_128( const __m256i v )
|
||||
{ return _mm256_alignr_epi64( v, v, 2 ); }
|
||||
|
||||
static inline __m256i mm256_ror_1x64( const __m256i v )
|
||||
{ return _mm256_alignr_epi64( v, v, 1 ); }
|
||||
|
||||
static inline __m256i mm256_rol_1x64( const __m256i v )
|
||||
{ return _mm256_alignr_epi64( v, v, 3 ); }
|
||||
|
||||
static inline __m256i mm256_ror_1x32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 1 ); }
|
||||
|
||||
static inline __m256i mm256_rol_1x32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 7 ); }
|
||||
|
||||
#else // AVX2
|
||||
|
||||
// Swap 128 bit elements in 256 bit vector.
|
||||
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
|
||||
#define mm256_shuflr_128 mm256_swap_128
|
||||
#define mm256_shufll_128 mm256_swap_128
|
||||
|
||||
// Rotate 256 bit vector by one 64 bit element
|
||||
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
|
||||
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
|
||||
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
|
||||
|
||||
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
|
||||
|
||||
// Rotate 256 bit vector by one 32 bit element.
|
||||
#define mm256_ror_1x32( v ) \
|
||||
#define mm256_shuflr_32( v ) \
|
||||
_mm256_permutevar8x32_epi32( v, \
|
||||
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
|
||||
0x0000000400000003, 0x0000000200000001 )
|
||||
0x0000000400000003, 0x0000000200000001 ) )
|
||||
|
||||
#define mm256_rol_1x32( v ) \
|
||||
#define mm256_shufll_32( v ) \
|
||||
_mm256_permutevar8x32_epi32( v, \
|
||||
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
|
||||
0x0000000200000001, 0x0000000000000007 )
|
||||
0x0000000200000001, 0x0000000000000007 ) )
|
||||
|
||||
|
||||
#endif // AVX512 else AVX2
|
||||
|
||||
//
|
||||
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||
|
||||
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
|
||||
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
|
||||
#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
|
||||
#define mm256_shuflr128_64 mm256_swap128_64
|
||||
#define mm256_shufll128_64 mm256_swap128_64
|
||||
|
||||
static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
|
||||
#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
|
||||
#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
|
||||
|
||||
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
{ return _mm256_alignr_epi8( v, v, c ); }
|
||||
|
||||
// Swap 32 bit elements in each 64 bit lane.
|
||||
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||
#define mm256_shuflr64_32 mm256_swap64_32
|
||||
#define mm256_shufll64_32 mm256_swap64_32
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements, endian bswap.
|
||||
@@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
|
||||
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
|
||||
// makes these macros unnecessary.
|
||||
|
||||
// continue using vror/vrol notation for now to avoid confusion with
|
||||
// shufl2r/shufl2l macro functions available with AVX512.
|
||||
#define mm256_swap512_256( v1, v2 ) \
|
||||
v1 = _mm256_xor_si256( v1, v2 ); \
|
||||
v2 = _mm256_xor_si256( v1, v2 ); \
|
||||
v1 = _mm256_xor_si256( v1, v2 );
|
||||
|
||||
#define mm256_ror512_128( v1, v2 ) \
|
||||
#define mm256_vror512_128( v1, v2 ) \
|
||||
do { \
|
||||
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
||||
v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm256_rol512_128( v1, v2 ) \
|
||||
#define mm256_vrol512_128( v1, v2 ) \
|
||||
do { \
|
||||
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
||||
v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
|
||||
|
@@ -74,13 +74,22 @@
|
||||
// __AVX512VBMI__ __AVX512VAES__
|
||||
//
|
||||
|
||||
// Used instead of casting.
|
||||
typedef union
|
||||
{
|
||||
__m512i m512;
|
||||
__m128i m128[4];
|
||||
uint32_t u32[16];
|
||||
uint64_t u64[8];
|
||||
} __attribute__ ((aligned (64))) m512_ovly;
|
||||
|
||||
// Move integer to/from element 0 of vector.
|
||||
|
||||
#define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
|
||||
#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )
|
||||
|
||||
#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) )
|
||||
#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) )
|
||||
// Move the low 64 bits of a 512 bit vector to an integer: narrow to the low 128 bits, then reuse the 128 bit helper.
#define u64_mov512_64( a ) u64_mov128_64( _mm512_castsi512_si128( a ) )
|
||||
// Move the low 32 bits of a 512 bit vector to an integer: narrow to the low 128 bits, then reuse the 128 bit helper.
#define u32_mov512_32( a ) u32_mov128_32( _mm512_castsi512_si128( a ) )
|
||||
|
||||
// A simple 128 bit permute, using function instead of macro avoids
|
||||
// problems if the v arg passed as an expression.
|
||||
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
|
||||
#define mm512_concat_256( hi, lo ) \
|
||||
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
|
||||
|
||||
#define m512_const_128( v3, v2, v1, v0 ) \
|
||||
mm512_concat_256( mm256_concat_128( v3, v2 ), \
|
||||
mm256_concat_128( v1, v0 ) )
|
||||
|
||||
// Equivalent of set, assign 64 bit integers to respective 64 bit elements.
|
||||
// Use stack memory overlay
|
||||
static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
@@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
|
||||
//
|
||||
// Ternary logic uses 8 bit truth table to define any 3 input logical
|
||||
// operation using any number or combinations of AND, OR XOR, NOT.
|
||||
// expression using any number or combinations of AND, OR, XOR, NOT.
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm512_xor3( a, b, c ) \
|
||||
@@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_andxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b & c )
|
||||
// a ^ ( b | c )
|
||||
#define mm512_xoror( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ]
|
||||
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
|
||||
#define mm512_xorandnot( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
@@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
|
||||
// Some 2 input operations that don't have their own instruction mnemonic.
|
||||
|
||||
// ~( a | b )
|
||||
// ~( a | b ), (~a) & (~b)
|
||||
#define mm512_nor( a, b ) \
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
// ~( a ^ b ), (~a) ^ b
|
||||
#define mm512_xnor( a, b ) \
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
@@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0xef )
|
||||
|
||||
|
||||
// Diagonal blending
|
||||
// Blend 8 64 bit elements from 8 vectors
|
||||
// Diagonal blend of eight vectors of 64 bit elements: element n of the
// result is taken from vn. A set mask bit selects the element from the
// second vector operand of the blend. The upper half (v7..v4) and lower
// half (v3..v0) are each assembled from two pair-wise blends and then
// merged by the final 0x0f blend.
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
   _mm512_mask_blend_epi64( 0x0f, \
      _mm512_mask_blend_epi64( 0x30, \
         _mm512_mask_blend_epi64( 0x40, v7, v6 ), \
         _mm512_mask_blend_epi64( 0x10, v5, v4 ) ), \
      _mm512_mask_blend_epi64( 0x03, \
         _mm512_mask_blend_epi64( 0x04, v3, v2 ), \
         _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
|
||||
|
||||
|
||||
// Blend 4 32 bit elements from each 128 bit lane.
|
||||
// Built from three masked blends: mask 0x4444 merges v3 with v2, mask
// 0x1111 merges v1 with v0 and mask 0x3333 merges the two partial results.
// NOTE(review): assumes a set mask bit selects the second vector operand,
// which gives element n of every 128 bit lane taken from vn - confirm
// against the intrinsic reference.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
|
||||
_mm512_mask_blend_epi32( 0x3333, \
|
||||
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
|
||||
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
|
||||
|
||||
|
||||
|
||||
|
||||
// Bit rotations.
|
||||
|
||||
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
|
||||
@@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
|
||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
//
|
||||
// Rotate elements in 512 bit vector.
|
||||
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
|
||||
//
|
||||
|
||||
// rename plan change ror to vror for Vector ROtate Right,
|
||||
// and vrol for Vector ROtate Left, not to be confused with
|
||||
//variable rotate rorv, rolv,
|
||||
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
|
||||
// operation. 1xNN notation is also removed and replaced with simpler NN.
|
||||
// Swap will still have its own mnemonic and will be aliased as both
|
||||
// left and right shuffles.
|
||||
|
||||
// Shift elements right or left in 512 bit vector, filling with zeros.
|
||||
// Multiple element shifts can be combined into a single larger
|
||||
// element shift.
|
||||
|
||||
// Whole-vector element shifts with zero fill. Each is an alignr of v with
// an all-zero vector: a right shift puts the zero vector in the high
// position and shifts by the element count, a left shift puts it in the
// low position and shifts by (elements per vector - count).

#define mm512_shiftr_256( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 )
#define mm512_shiftl_256( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 )

#define mm512_shiftr_128( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 )
#define mm512_shiftl_128( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 6 )

#define mm512_shiftr_64( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 )
#define mm512_shiftl_64( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 7 )

#define mm512_shiftr_32( v ) \
   _mm512_alignr_epi32( _mm512_setzero_si512(), v, 1 )
#define mm512_shiftl_32( v ) \
   _mm512_alignr_epi32( v, _mm512_setzero_si512(), 15 )
|
||||
|
||||
// Shuffle-rotate elements left or right in 512 bit vector.
|
||||
|
||||
static inline __m512i mm512_swap_256( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 4 ); }
|
||||
// Swapping the two 256 bit halves is both a left and a right shuffle by
// 256 bits, so both names alias mm512_swap_256. Object-like aliases are
// used so the caller's argument is passed through to the function.
#define mm512_shuflr_256 mm512_swap_256
#define mm512_shufll_256 mm512_swap_256
|
||||
|
||||
static inline __m512i mm512_ror_1x128( const __m512i v )
|
||||
static inline __m512i mm512_shuflr_128( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 2 ); }
|
||||
|
||||
static inline __m512i mm512_rol_1x128( const __m512i v )
|
||||
static inline __m512i mm512_shufll_128( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 6 ); }
|
||||
|
||||
static inline __m512i mm512_ror_1x64( const __m512i v )
|
||||
static inline __m512i mm512_shuflr_64( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 1 ); }
|
||||
|
||||
static inline __m512i mm512_rol_1x64( const __m512i v )
|
||||
static inline __m512i mm512_shufll_64( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 7 ); }
|
||||
|
||||
static inline __m512i mm512_ror_1x32( const __m512i v )
|
||||
static inline __m512i mm512_shuflr_32( const __m512i v )
|
||||
{ return _mm512_alignr_epi32( v, v, 1 ); }
|
||||
|
||||
static inline __m512i mm512_rol_1x32( const __m512i v )
|
||||
static inline __m512i mm512_shufll_32( const __m512i v )
|
||||
{ return _mm512_alignr_epi32( v, v, 15 ); }
|
||||
|
||||
static inline __m512i mm512_ror_x64( const __m512i v, const int n )
|
||||
// Generic
|
||||
static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
|
||||
{ return _mm512_alignr_epi64( v, v, n ); }
|
||||
|
||||
static inline __m512i mm512_ror_x32( const __m512i v, const int n )
|
||||
static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
|
||||
{ return _mm512_alignr_epi32( v, v, n ); }
|
||||
|
||||
#define mm512_ror_1x16( v ) \
|
||||
#define mm512_shuflr_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x0000001F001E001D, 0x001C001B001A0019, \
|
||||
0X0018001700160015, 0X0014001300120011, \
|
||||
0X0010000F000E000D, 0X000C000B000A0009, \
|
||||
0X0008000700060005, 0X0004000300020001 ), v )
|
||||
|
||||
#define mm512_rol_1x16( v ) \
|
||||
#define mm512_shufll_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001E001D001C001B, 0x001A001900180017, \
|
||||
0X0016001500140013, 0X001200110010000F, \
|
||||
0X000E000D000C000B, 0X000A000900080007, \
|
||||
0X0006000500040003, 0X000200010000001F ), v )
|
||||
|
||||
#define mm512_ror_1x8( v ) \
|
||||
// Byte shuffle of v using a fixed index vector in which every byte index is
// one greater than its position (the top byte wraps to index 0).
// The arguments of m512_const_64 must be comma separated; a '.' after a
// constant does not compile.
// NOTE(review): _mm512_shuffle_epi8 selects bytes within each 128 bit lane
// only, so this presumably rotates each lane rather than the full 512 bit
// vector - confirm against the intended use.
#define mm512_shuflr_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x003F3E3D3C3B3A39, 0x3837363534333231, \
                       0x302F2E2D2C2B2A29, 0x2827262524232221, \
                       0x201F1E1D1C1B1A19, 0x1817161514131211, \
                       0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
|
||||
|
||||
#define mm512_rol_1x8( v ) \
|
||||
#define mm512_shufll_8( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x3E3D3C3B3A393837, 0x363534333231302F. \
|
||||
0x2E2D2C2B2A292827, 0x262524232221201F, \
|
||||
@@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
|
||||
|
||||
//
|
||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||
// 128 bit lane shift is handled by bslli bsrli.
|
||||
|
||||
// Swap hi & lo 128 bits in each 256 bit lane
|
||||
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||
#define mm512_shuflr256_128 mm512_swap256_128
|
||||
#define mm512_shufll256_128 mm512_swap256_128
|
||||
|
||||
// Rotate 256 bit lanes by one 64 bit element
|
||||
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
|
||||
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
#define mm512_ror256_32( v ) \
|
||||
#define mm512_shuflr256_32( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
0x000000080000000f, 0x0000000e0000000d, \
|
||||
0x0000000c0000000b, 0x0000000a00000009, \
|
||||
0x0000000000000007, 0x0000000600000005, \
|
||||
0x0000000400000003, 0x0000000200000001 ), v )
|
||||
|
||||
#define mm512_rol256_32( v ) \
|
||||
#define mm512_shufll256_32( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
0x0000000e0000000d, 0x0000000c0000000b, \
|
||||
0x0000000a00000009, 0x000000080000000f, \
|
||||
0x0000000600000005, 0x0000000400000003, \
|
||||
0x0000000200000001, 0x0000000000000007 ), v )
|
||||
|
||||
#define mm512_ror256_16( v ) \
|
||||
#define mm512_shuflr256_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x00100001001e001d, 0x001c001b001a0019, \
|
||||
0x0018001700160015, 0x0014001300120011, \
|
||||
0x0000000f000e000d, 0x000c000b000a0009, \
|
||||
0x0008000700060005, 0x0004000300020001 ), v )
|
||||
|
||||
#define mm512_rol256_16( v ) \
|
||||
#define mm512_shufll256_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001e001d001c001b, 0x001a001900180017, \
|
||||
0x0016001500140013, 0x001200110010001f, \
|
||||
0x000e000d000c000b, 0x000a000900080007, \
|
||||
0x0006000500040003, 0x000200010000000f ), v )
|
||||
|
||||
#define mm512_ror256_8( v ) \
|
||||
#define mm512_shuflr256_8( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x203f3e3d3c3b3a39, 0x3837363534333231, \
|
||||
0x302f2e2d2c2b2a29, 0x2827262524232221, \
|
||||
0x001f1e1d1c1b1a19, 0x1817161514131211, \
|
||||
0x100f0e0d0c0b0a09, 0x0807060504030201 ) )
|
||||
|
||||
#define mm512_rol256_8( v ) \
|
||||
#define mm512_shufll256_8( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x3e3d3c3b3a393837, 0x363534333231302f, \
|
||||
0x2e2d2c2b2a292827, 0x262524232221203f, \
|
||||
@@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
|
||||
0x0e0d0c0b0a090807, 0x060504030201001f ) )
|
||||
|
||||
//
|
||||
// Rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
// Shuffle-rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
// Swap 64 bits in each 128 bit lane
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
#define mm512_shuflr128_64 mm512_swap128_64
|
||||
#define mm512_shufll128_64 mm512_swap128_64
|
||||
|
||||
// Rotate 128 bit lanes by one 32 bit element
|
||||
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
|
||||
// Rotate right 128 bit lanes by c bytes
|
||||
static inline __m512i mm512_ror128_x8( const __m512i v, const int c )
|
||||
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
|
||||
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
{ return _mm512_alignr_epi8( v, v, c ); }
|
||||
|
||||
// Swap 32 bits in each 64 bit lane.
|
||||
// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
|
||||
// but only with AVX512. Shuffle is just as fast and available with AVX2
|
||||
// & SSE2.
|
||||
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||
#define mm512_shuflr64_32 mm512_swap64_32
|
||||
#define mm512_shufll64_32 mm512_swap64_32
|
||||
|
||||
|
||||
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
|
||||
// and 2 input 2 output shuffle macros.
|
||||
//
|
||||
// Rotate elements from 2 512 bit vectors in place, source arguments
|
||||
// shuflr is 1 input
|
||||
// shufl2r is 2 input ...
|
||||
// Drop macros? They can easily be rebuilt using shufl2 functions
|
||||
|
||||
// add shuflr shufll functions performing rotate, returning first arg
|
||||
// They're faster than doing both, when both not needed.
|
||||
|
||||
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
|
||||
// rotated v1
|
||||
// Visually confusing for shufl2r because of arg order. First arg is always
|
||||
// the target for modification, either update by reference or by function
|
||||
// return.
|
||||
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
|
||||
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
|
||||
|
||||
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
|
||||
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
|
||||
|
||||
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
|
||||
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
|
||||
|
||||
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
|
||||
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
|
||||
|
||||
// Rotate elements from 2 512 bit vectors in place, source arguments
|
||||
// are overwritten.
|
||||
|
||||
#define mm512_swap1024_512( v1, v2 ) \
|
||||
v1 = _mm512_xor_si512( v1, v2 ); \
|
||||
v2 = _mm512_xor_si512( v1, v2 ); \
|
||||
v1 = _mm512_xor_si512( v1, v2 );
|
||||
// Shuffling two 512 bit vectors by 512 bits, in either direction, is a plain
// swap of the two vectors. Each alias is a one-line definition; a trailing
// backslash here would splice the following line into the macro body.
#define mm512_shufl2l_512   mm512_swap1024_512
#define mm512_shufl2r_512   mm512_swap1024_512
|
||||
|
||||
#define mm512_ror1024_256( v1, v2 ) \
|
||||
// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
|
||||
// for now.
|
||||
// Rotate elements from 2 512 bit vectors in place, both source arguments
|
||||
// are updated.
|
||||
|
||||
#define mm512_vror1024_256( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
||||
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_rol1024_256( v1, v2 ) \
|
||||
#define mm512_vrol1024_256( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
||||
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_ror1024_128( v1, v2 ) \
|
||||
#define mm512_vror1024_128( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
|
||||
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_rol1024_128( v1, v2 ) \
|
||||
#define mm512_vrol1024_128( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
|
||||
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_ror1024_64( v1, v2 ) \
|
||||
#define mm512_vror1024_64( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
|
||||
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_rol1024_64( v1, v2 ) \
|
||||
#define mm512_vrol1024_64( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
|
||||
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_ror1024_32( v1, v2 ) \
|
||||
#define mm512_vror1024_32( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
|
||||
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm512_rol1024_32( v1, v2 ) \
|
||||
#define mm512_vrol1024_32( v1, v2 ) \
|
||||
do { \
|
||||
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
|
||||
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
|
||||
|
@@ -68,13 +68,13 @@
|
||||
// rotation.
|
||||
|
||||
// Swap hi & lo 32 bits.
|
||||
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
|
||||
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
|
||||
|
||||
#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 )
|
||||
#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 )
|
||||
// Shuffle the four 16 bit elements of a 64 bit vector with control 0x39.
#define mm64_shuflr_16( a )    _mm_shuffle_pi16( a, 0x39 )
// Misspelled legacy name, kept so that any existing user still compiles.
#define mm64_shulfr_16( a )    mm64_shuflr_16( a )
|
||||
#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 )
|
||||
|
||||
// Swap hi & lo 16 bits of each 32 bit element
|
||||
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
|
||||
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
|
||||
|
||||
// Rotate right by c bytes
|
||||
static inline __m64 mm64_ror_x8( __m64 v, const int c )
|
||||
static inline __m64 mm64_vror_x8( __m64 v, const int c )
|
||||
{ return _mm_alignr_pi8( v, v, c ); }
|
||||
|
||||
#else
|
||||
|
@@ -5,10 +5,19 @@
|
||||
#define bswap_64( a ) __builtin_bswap64( a )
|
||||
#define bswap_32( a ) __builtin_bswap32( a )
|
||||
|
||||
// safe division, integer or floating point
|
||||
// Safe division, integer or floating point. For floating point it's as
|
||||
// safe as 0. is precisely zero.
|
||||
// Returns safe_result if division by zero.
|
||||
#define safe_div( dividend, divisor, safe_result ) \
|
||||
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
|
||||
|
||||
// Aliases with familiar names for built in bit rotate instructions
|
||||
#define rol64( a, n ) _lrotl( a, n )
|
||||
#define ror64( a, n ) _lrotr( a, n )
|
||||
#define rol32( a, n ) _rotl( a, n )
|
||||
#define ror32( a, n ) _rotr( a, n )
|
||||
#define rol16( a, n ) _rotwl( a, n )
|
||||
#define ror16( a, n ) _rotwr( a, n )
|
||||
|
||||
///////////////////////////////////////
|
||||
//
|
||||
@@ -29,12 +38,14 @@
|
||||
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
|
||||
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
|
||||
|
||||
// obsolete test
|
||||
// Compiler check for __int128 support
|
||||
// Configure also has a test for int128.
|
||||
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
||||
#define GCC_INT128 1
|
||||
#endif
|
||||
|
||||
// obsolete test
|
||||
#if !defined(GCC_INT128)
|
||||
#warning "__int128 not supported, requires GCC-4.8 or newer."
|
||||
#endif
|
||||
|
@@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
|
||||
for (int i = 2; i <= (ext & 0xF); i++)
|
||||
{
|
||||
cpuid(0x80000000+i, output);
|
||||
memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
|
||||
memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
|
||||
}
|
||||
snprintf(outbuf, maxsz, "%s", brand);
|
||||
}
|
||||
|
60
util.c
60
util.c
@@ -47,6 +47,7 @@
|
||||
//#include "miner.h"
|
||||
#include "elist.h"
|
||||
#include "algo-gate-api.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
//extern pthread_mutex_t stats_lock;
|
||||
|
||||
@@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... )
|
||||
|
||||
// localtime_r(&now, &tm);
|
||||
|
||||
switch (prio) {
|
||||
switch ( prio )
|
||||
{
|
||||
case LOG_CRIT: color = CL_LRD; break;
|
||||
case LOG_ERR: color = CL_RED; break;
|
||||
case LOG_WARNING: color = CL_YLW; break;
|
||||
case LOG_WARNING: color = CL_YL2; break;
|
||||
case LOG_MAJR: color = CL_YL2; break;
|
||||
case LOG_NOTICE: color = CL_WHT; break;
|
||||
case LOG_INFO: color = ""; break;
|
||||
case LOG_DEBUG: color = CL_GRY; break;
|
||||
|
||||
case LOG_BLUE:
|
||||
prio = LOG_NOTICE;
|
||||
color = CL_CYN;
|
||||
break;
|
||||
case LOG_MINR: color = CL_YLW; break;
|
||||
case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break;
|
||||
case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break;
|
||||
case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break;
|
||||
}
|
||||
if (!use_colors)
|
||||
color = "";
|
||||
@@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...)
|
||||
|
||||
localtime_r(&now, &tm);
|
||||
|
||||
switch (prio) {
|
||||
case LOG_ERR: color = CL_RED; break;
|
||||
case LOG_WARNING: color = CL_YLW; break;
|
||||
switch ( prio )
|
||||
{
|
||||
case LOG_CRIT: color = CL_LRD; break;
|
||||
case LOG_ERR: color = CL_RED; break;
|
||||
case LOG_WARNING: color = CL_YL2; break;
|
||||
case LOG_MAJR: color = CL_YL2; break;
|
||||
case LOG_NOTICE: color = CL_WHT; break;
|
||||
case LOG_INFO: color = ""; break;
|
||||
case LOG_INFO: color = ""; break;
|
||||
case LOG_DEBUG: color = CL_GRY; break;
|
||||
|
||||
case LOG_BLUE:
|
||||
prio = LOG_NOTICE;
|
||||
color = CL_CYN;
|
||||
break;
|
||||
case LOG_MINR: color = CL_YLW; break;
|
||||
case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break;
|
||||
case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break;
|
||||
case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break;
|
||||
}
|
||||
if (!use_colors)
|
||||
color = "";
|
||||
@@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output)
|
||||
);
|
||||
}
|
||||
|
||||
// For use with MiB etc
|
||||
// Scale *n down by successive factors of 1024 until its magnitude is below
// 10240, and store the matching prefix character in *si_units:
// 0 (no prefix), then 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'.
// At most 8 divisions are performed, anything still larger is reported
// with 'Y'. Both arguments are updated in place.
// The threshold is applied to the magnitude so that negative values are
// scaled the same way as positive ones and keep their sign.
void format_number_si( double* n, char* si_units )
{
   static const char prefixes[] =
                        { 0, 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' };
   const int last = (int)sizeof( prefixes ) - 1;
   const double limit = 1024*10;
   int i = 0;

   while ( i < last )
   {
      const double magnitude = ( *n < 0 ) ? -*n : *n;
      // A NaN never compares below the limit, so it runs through to 'Y'.
      if ( magnitude < limit ) break;
      *n /= 1024;
      i++;
   }
   *si_units = prefixes[i];
}
|
||||
|
||||
|
||||
/* Modify the representation of integer numbers which would cause an overflow
|
||||
* so that they are treated as floating-point numbers.
|
||||
* This is a hack to overcome the limitations of some versions of Jansson. */
|
||||
|
Reference in New Issue
Block a user