This commit is contained in:
Jay D Dee
2021-09-29 17:31:16 -04:00
parent 9b905fccc8
commit 2cd1507c2e
80 changed files with 8145 additions and 2097 deletions

View File

@@ -158,7 +158,9 @@ cpuminer_SOURCES = \
algo/ripemd/lbry.c \
algo/ripemd/lbry-4way.c \
algo/scrypt/scrypt.c \
algo/scrypt/scrypt-core-4way.c \
algo/scrypt/neoscrypt.c \
algo/sha/sha256-hash.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
@@ -167,6 +169,7 @@ cpuminer_SOURCES = \
algo/sha/sha256-hash-2way-ni.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \

View File

@@ -65,6 +65,37 @@ If not what makes it happen or not happen?
Change Log
----------
v3.18.0
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
- AVX512 & SHA support for SHA256; AVX512 has priority,
- up to 50% increase in hashrate,
- memory requirements reduced 30-60% depending on CPU architecture,
- memory usage displayed at startup,
- scrypt, default N=1024 (LTC), will likely perform slower.
Improved stale share detection and handling for Scrypt with large N factor:
- abort and discard partially computed hash when new work is detected,
- quicker response to a new job, less time wasted mining a stale job.
Improved stale share handling for all algorithms:
- report a possible stale share when new work is received while a previously
submitted share is still pending,
- when new work is detected, report the submission of an already completed,
otherwise valid, but likely stale share,
- fixed incorrect block height in stale share log.
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
When stratum disconnects, miner threads go idle until reconnected.
Colour changes to some logs.
Some low level function name changes for clarity and consistency.
The reference hashrate in the summary log and the benchmark total hashrate
are now the mean hashrate for the session.
v3.17.1
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.

View File

@@ -1,3 +1,6 @@
#ifndef __ALGO_GATE_API_H__
#define __ALGO_GATE_API_H__ 1
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
@@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
// algo name if valid alias, NULL if invalid alias or algo.
void get_algo_alias( char **algo_or_alias );
#endif

View File

@@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define ror64(x, n) _mm512_ror_epi64((x), (n))
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
static __m512i muladd(__m512i x, __m512i y)
{
@@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 32); \
D1 = ror64(D1, 32); \
D0 = ROR64(D0, 32); \
D1 = ROR64(D1, 32); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 24); \
B1 = ror64(B1, 24); \
B0 = ROR64(B0, 24); \
B1 = ROR64(B1, 24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 16); \
D1 = ror64(D1, 16); \
D0 = ROR64(D0, 16); \
D1 = ROR64(D1, 16); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 63); \
B1 = ror64(B1, 63); \
B0 = ROR64(B0, 63); \
B1 = ROR64(B1, 63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \

View File

@@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
#define rb6(x) mm256_rol_64( x, 43 )
#define rb7(x) mm256_rol_64( x, 53 )
#define rol_off_64( M, j, off ) \
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define rol_off_64( M, j ) \
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
rol_off_64( M, j, 3 ) ), \
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define add_elt_b( mj0, mj3, mj10, h, K ) \
_mm256_xor_si256( h, _mm256_add_epi64( K, \
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( mm256_add4_64( \
#define expand1_b( qt, i ) \
mm256_add4_64( \
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
@@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
#define expand2b( qt, M, H, i) \
_mm256_add_epi64( mm256_add4_64( \
#define expand2_b( qt, i) \
mm256_add4_64( \
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
@@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
_mm256_add_epi64( mh[13], mh[14] ) )
#define Wb1 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
_mm256_sub_epi64( mh[14], mh[15] ) )
#define Wb2 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
_mm256_sub_epi64( mh[12], mh[15] ) )
#define Wb3 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
_mm256_sub_epi64( mh[10], mh[13] ) )
#define Wb4 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
_mm256_add_epi64( mh[11], mh[14] ) )
#define Wb5 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
_mm256_sub_epi64( mh[12], mh[15] ) )
#define Wb6 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
_mm256_sub_epi64( mh[11], mh[13] ) )
#define Wb7 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
_mm256_add_epi64( mh[12], mh[14] ) )
#define Wb8 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
_mm256_sub_epi64( mh[13], mh[15] ) )
#define Wb9 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 7], mh[14] ) )
#define Wb10 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
_mm256_sub_epi64( mh[ 7], mh[15] ) )
#define Wb11 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
#define Wb12 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[10], H[10] ) ) )
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 9], mh[10] ) )
#define Wb13 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[11], H[11] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
_mm256_add_epi64( mh[10], mh[11] ) )
#define Wb14 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[12], H[12] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
_mm256_add_epi64( mh[11], mh[12] ) )
#define Wb15 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 9], mh[13] ) )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh;
__m256i mh[16];
int i;
for ( i = 0; i < 16; i++ )
mh[i] = _mm256_xor_si256( M[i], H[i] );
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
@@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
qt[19] = expand2b( qt, M, H, 19 );
qt[20] = expand2b( qt, M, H, 20 );
qt[21] = expand2b( qt, M, H, 21 );
qt[22] = expand2b( qt, M, H, 22 );
qt[23] = expand2b( qt, M, H, 23 );
qt[24] = expand2b( qt, M, H, 24 );
qt[25] = expand2b( qt, M, H, 25 );
qt[26] = expand2b( qt, M, H, 26 );
qt[27] = expand2b( qt, M, H, 27 );
qt[28] = expand2b( qt, M, H, 28 );
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
__m256i mj[16];
for ( i = 0; i < 16; i++ )
mj[i] = rol_off_64( M, i );
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
xl = _mm256_xor_si256(
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
@@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi64( \
_mm256_xor_si256( M[m], \
@@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define r8b6(x) mm512_rol_64( x, 43 )
#define r8b7(x) mm512_rol_64( x, 53 )
#define rol8w_off_64( M, j, off ) \
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define rol8w_off_64( M, j ) \
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
#define add_elt_b8( M, H, j ) \
_mm512_xor_si512( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
rol8w_off_64( M, j, 3 ) ), \
rol8w_off_64( M, j, 10 ) ), \
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
_mm512_xor_si512( h, _mm512_add_epi64( K, \
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
#define expand1b8( qt, M, H, i ) \
_mm512_add_epi64( mm512_add4_64( \
#define expand1_b8( qt, i ) \
mm512_add4_64( \
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
@@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
#define expand2b8( qt, M, H, i) \
_mm512_add_epi64( mm512_add4_64( \
#define expand2_b8( qt, i) \
mm512_add4_64( \
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
@@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
#define W8b0 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
_mm512_add_epi64( mh[13], mh[14] ) )
#define W8b1 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_xor_si512( M[11], H[11] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
_mm512_sub_epi64( mh[14], mh[15] ) )
#define W8b2 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
_mm512_sub_epi64( mh[12], mh[15] ) )
#define W8b3 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
_mm512_sub_epi64( mh[10], mh[13] ) )
#define W8b4 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
_mm512_add_epi64( mh[11], mh[14] ) )
#define W8b5 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
_mm512_sub_epi64( mh[12], mh[15] ) )
#define W8b6 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
_mm512_sub_epi64( mh[11], mh[13] ) )
#define W8b7 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
_mm512_add_epi64( mh[12], mh[14] ) )
#define W8b8 \
_mm512_add_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
_mm512_sub_epi64( mh[13], mh[15] ) )
#define W8b9 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 7], mh[14] ) )
#define W8b10 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
_mm512_sub_epi64( mh[ 7], mh[15] ) )
#define W8b11 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
#define W8b12 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[10], H[10] ) ) )
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 9], mh[10] ) )
#define W8b13 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[11], H[11] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
_mm512_add_epi64( mh[10], mh[11] ) )
#define W8b14 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[12], H[12] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
_mm512_add_epi64( mh[11], mh[12] ) )
#define W8b15 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[ 4], H[4] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 9], mh[13] ) )
void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
int i;
for ( i = 0; i < 16; i++ )
mh[i] = _mm512_xor_si512( M[i], H[i] );
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
@@ -1268,22 +1169,60 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
qt[16] = expand1b8( qt, M, H, 16 );
qt[17] = expand1b8( qt, M, H, 17 );
qt[18] = expand2b8( qt, M, H, 18 );
qt[19] = expand2b8( qt, M, H, 19 );
qt[20] = expand2b8( qt, M, H, 20 );
qt[21] = expand2b8( qt, M, H, 21 );
qt[22] = expand2b8( qt, M, H, 22 );
qt[23] = expand2b8( qt, M, H, 23 );
qt[24] = expand2b8( qt, M, H, 24 );
qt[25] = expand2b8( qt, M, H, 25 );
qt[26] = expand2b8( qt, M, H, 26 );
qt[27] = expand2b8( qt, M, H, 27 );
qt[28] = expand2b8( qt, M, H, 28 );
qt[29] = expand2b8( qt, M, H, 29 );
qt[30] = expand2b8( qt, M, H, 30 );
qt[31] = expand2b8( qt, M, H, 31 );
__m512i mj[16];
for ( i = 0; i < 16; i++ )
mj[i] = rol8w_off_64( M, i );
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
mm512_xor3( qt[19], qt[20], qt[21] ),

View File

@@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp )
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
}
// 8 ways, 4 way parallel double buffered
static void transform_4way_2buf( cube_4way_2buf_context *sp )
{
int r;
const int rounds = sp->rounds;
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
__m512i tx0, tx1, ty0, ty1;
x0 = _mm512_load_si512( (__m512i*)sp->h0 );
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
y0 = _mm512_load_si512( (__m512i*)sp->h1 );
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
tx0 = x0;
ty0 = y0;
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx1 = x1;
ty1 = y1;
x0 = mm512_rol_32( x2, 7 );
y0 = mm512_rol_32( y2, 7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x1 = mm512_rol_32( x3, 7 );
y1 = mm512_rol_32( y3, 7 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x2 = mm512_rol_32( tx0, 7 );
y2 = mm512_rol_32( ty0, 7 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = _mm512_xor_si512( y0, y4 );
x4 = mm512_swap128_64( x4 );
x3 = mm512_rol_32( tx1, 7 );
y3 = mm512_rol_32( ty1, 7 );
y4 = mm512_swap128_64( y4 );
x1 = _mm512_xor_si512( x1, x5 );
y1 = _mm512_xor_si512( y1, y5 );
x5 = mm512_swap128_64( x5 );
x2 = _mm512_xor_si512( x2, x6 );
y2 = _mm512_xor_si512( y2, y6 );
y5 = mm512_swap128_64( y5 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap128_64( x6 );
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
y6 = mm512_swap128_64( y6 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
x7 = mm512_swap128_64( x7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
tx0 = x0;
ty0 = y0;
y7 = mm512_swap128_64( y7 );
tx1 = x2;
ty1 = y2;
x0 = mm512_rol_32( x1, 11 );
y0 = mm512_rol_32( y1, 11 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x1 = mm512_rol_32( tx0, 11 );
y1 = mm512_rol_32( ty0, 11 );
x0 = _mm512_xor_si512( x0, x4 );
x4 = mm512_swap64_32( x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x3, 11 );
y4 = mm512_swap64_32( y4 );
y2 = mm512_rol_32( y3, 11 );
x1 = _mm512_xor_si512( x1, x5 );
x5 = mm512_swap64_32( x5 );
y1 = _mm512_xor_si512( y1, y5 );
x3 = mm512_rol_32( tx1, 11 );
y5 = mm512_swap64_32( y5 );
y3 = mm512_rol_32( ty1, 11 );
x2 = _mm512_xor_si512( x2, x6 );
x6 = mm512_swap64_32( x6 );
y2 = _mm512_xor_si512( y2, y6 );
y6 = mm512_swap64_32( y6 );
x3 = _mm512_xor_si512( x3, x7 );
x7 = mm512_swap64_32( x7 );
y3 = _mm512_xor_si512( y3, y7 );
y7 = mm512_swap64_32( y7 );
}
_mm512_store_si512( (__m512i*)sp->h0, x0 );
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
_mm512_store_si512( (__m512i*)sp->h1, y0 );
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
}
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
@@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
return 0;
}
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
void *output0, void *output1, int hashbitlen,
const void *data0, const void *data1, size_t size )
{
__m512i *h0 = (__m512i*)sp->h0;
__m512i *h1 = (__m512i*)sp->h1;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h1[0] = h0[0] = m512_const1_128( iv[0] );
h1[1] = h0[1] = m512_const1_128( iv[1] );
h1[2] = h0[2] = m512_const1_128( iv[2] );
h1[3] = h0[3] = m512_const1_128( iv[3] );
h1[4] = h0[4] = m512_const1_128( iv[4] );
h1[5] = h0[5] = m512_const1_128( iv[5] );
h1[6] = h0[6] = m512_const1_128( iv[6] );
h1[7] = h0[7] = m512_const1_128( iv[7] );
const int len = size >> 4;
const __m512i *in0 = (__m512i*)data0;
const __m512i *in1 = (__m512i*)data1;
__m512i *hash0 = (__m512i*)output0;
__m512i *hash1 = (__m512i*)output1;
int i;
for ( i = 0; i < len; i++ )
{
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way_2buf( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
__m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
transform_4way_2buf( sp );
tmp = m512_const2_64( 0x0000000100000000, 0 );
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
for ( i = 0; i < 10; ++i )
transform_4way_2buf( sp );
memcpy( hash0, sp->h0, sp->hashlen<<6);
memcpy( hash1, sp->h1, sp->hashlen<<6);
return 0;
}
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
@@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
// 2 way 128
// This isn't expected to be used with AVX512, so a HW rotate instruction
// is assumed not available.
// Use double buffering to optimize serial bit rotations. Full double
// buffering isn't practical because it would need twice as many registers,
// and AVX2 has only half as many as AVX512.
#define ROL2( out0, out1, in0, in1, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( in0, c ); \
__m256i t1 = _mm256_slli_epi32( in1, c ); \
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
out0 = _mm256_or_si256( out0, t0 ); \
out1 = _mm256_or_si256( out1, t1 ); \
}
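// For comparison: without a HW rotate instruction, a single-stream 32-bit
// rotate is one dependent slli/srli/or chain (essentially what mm256_rol_32
// expands to on AVX2). ROL2 above interleaves two such chains so the two
// streams' shifts can execute in parallel. Illustrative sketch only, not
// used by transform_2way below; the helper name is hypothetical.
static inline __m256i rol32_1way( const __m256i x, const int c )
{
   return _mm256_or_si256( _mm256_slli_epi32( x, c ),
                           _mm256_srli_epi32( x, 32-(c) ) );
}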
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp )
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm256_rol_32( x2, 7 );
x1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( y0, 7 );
x3 = mm256_rol_32( y1, 7 );
ROL2( x0, x1, x2, x3, 7 );
ROL2( x2, x3, y0, y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( y0, 11 );
x2 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x5 = mm256_swap128_64( x5 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = _mm256_add_epi32( x0, x4 );
x6 = mm256_swap128_64( x6 );
y0 = x0;
x5 = _mm256_add_epi32( x1, x5 );
x7 = mm256_swap128_64( x7 );
x6 = _mm256_add_epi32( x2, x6 );
y1 = x2;
ROL2( x0, x1, x1, y0, 11 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( x2, x3, x3, y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x4 = mm256_swap64_32( x4 );
x1 = _mm256_xor_si256( x1, x5 );
x5 = mm256_swap64_32( x5 );
x2 = _mm256_xor_si256( x2, x6 );
x6 = mm256_swap64_32( x6 );
x3 = _mm256_xor_si256( x3, x7 );
x7 = mm256_swap64_32( x7 );
}

View File

@@ -17,41 +17,41 @@ struct _cube_4way_context
int pos;
} __attribute__ ((aligned (128)));
struct _cube_4way_2buf_context
{
__m512i h0[8];
__m512i h1[8];
int hashlen;
int rounds;
int blocksize;
int pos;
} __attribute__ ((aligned (128)));
typedef struct _cube_4way_context cube_4way_context;
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
int blockbytes );
int blockbytes );
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
int cube_4way_close( cube_4way_context *sp, void *output );
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size );
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
#define cube512_4way_update cube_4way_update
#define cube512_4way_update_close cube_4way_update
#define cube512_4way_close cube_4way_update
#define cube512_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 512, data, size )
#define cube512_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 512, data, size )
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
#define cube256_4way_update cube_4way_update
#define cube256_4way_update_close cube_4way_update
#define cube256_4way_close cube_4way_update
#define cube256_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 256, data, size )
#define cube256_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 256, data, size )
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
void *output0, void *output1, int hashbitlen,
const void *data0, const void *data1, size_t size );
#endif
// 2x128, 2 way parallel SSE2
// 2x128, 2 way parallel AVX2
struct _cube_2way_context
{

View File

@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
for ( r = 0; r < rounds; ++r )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
x0 = _mm512_xor_si512( mm512_rol_32(
mm512_swap256_128( x0 ), 11 ), x1 );
x0 = mm512_swap_256( x0 );
x0 = mm512_rol_32( x0, 7 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap128_64( x1 );
x1 = _mm512_add_epi32( x0, x1 );
x0 = mm512_swap256_128( x0 );
x0 = mm512_rol_32( x0, 11 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap64_32( x1 );
}

View File

@@ -43,7 +43,8 @@
#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#define ROTL64(a,n) rol64( a, n )
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))

View File

@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
//#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#define ROTL64(a,n) rol64( a, n )
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))

View File

@@ -11,7 +11,7 @@
#else
#include "sph_groestl.h"
#endif
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
typedef struct {
#ifdef __AES__
@@ -19,7 +19,6 @@ typedef struct {
#else
sph_groestl512_context groestl;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
#else
sph_groestl512_init( &myrgr_ctx.groestl );
#endif
sph_sha256_init( &myrgr_ctx.sha );
}
void myriad_hash(void *output, const void *input)
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_sha256( &ctx.sha, hash, 64 );
sph_sha256_close( &ctx.sha, hash );
sha256_full( hash, hash, 64 );
memcpy(output, hash, 32);
}

View File

@@ -632,26 +632,25 @@ do { \
} while (0)
#define ROUND_BIG8(rc, alpha) \
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, m512_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
sA = _mm512_xor_si512( sA, alpha[10] ); \
sB = _mm512_xor_si512( sB, alpha[11] ); \
sC = _mm512_xor_si512( sC, alpha[12] ); \
sD = _mm512_xor_si512( sD, alpha[13] ); \
sE = _mm512_xor_si512( sE, alpha[14] ); \
sF = _mm512_xor_si512( sF, alpha[15] ); \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
@@ -731,28 +730,66 @@ do { \
#define P_BIG8 \
do { \
ROUND_BIG8(0, alpha_n); \
ROUND_BIG8(1, alpha_n); \
ROUND_BIG8(2, alpha_n); \
ROUND_BIG8(3, alpha_n); \
ROUND_BIG8(4, alpha_n); \
ROUND_BIG8(5, alpha_n); \
__m512i alpha[16]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \
} while (0)
#define PF_BIG8 \
do { \
ROUND_BIG8( 0, alpha_f); \
ROUND_BIG8( 1, alpha_f); \
ROUND_BIG8( 2, alpha_f); \
ROUND_BIG8( 3, alpha_f); \
ROUND_BIG8( 4, alpha_f); \
ROUND_BIG8( 5, alpha_f); \
ROUND_BIG8( 6, alpha_f); \
ROUND_BIG8( 7, alpha_f); \
ROUND_BIG8( 8, alpha_f); \
ROUND_BIG8( 9, alpha_f); \
ROUND_BIG8(10, alpha_f); \
ROUND_BIG8(11, alpha_f); \
__m512i alpha[16]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \
} while (0)
#define T_BIG8 \
@@ -965,26 +1002,25 @@ do { \
#define sF m7
*/
#define ROUND_BIG(rc, alpha) \
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
sA = _mm256_xor_si256( sA, alpha[10] ); \
sB = _mm256_xor_si256( sB, alpha[11] ); \
sC = _mm256_xor_si256( sC, alpha[12] ); \
sD = _mm256_xor_si256( sD, alpha[13] ); \
sE = _mm256_xor_si256( sE, alpha[14] ); \
sF = _mm256_xor_si256( sF, alpha[15] ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \
@@ -1064,28 +1100,66 @@ do { \
#define P_BIG \
do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
__m256i alpha[16]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \
} while (0)
#define PF_BIG \
do { \
ROUND_BIG( 0, alpha_f); \
ROUND_BIG( 1, alpha_f); \
ROUND_BIG( 2, alpha_f); \
ROUND_BIG( 3, alpha_f); \
ROUND_BIG( 4, alpha_f); \
ROUND_BIG( 5, alpha_f); \
ROUND_BIG( 6, alpha_f); \
ROUND_BIG( 7, alpha_f); \
ROUND_BIG( 8, alpha_f); \
ROUND_BIG( 9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
__m256i alpha[16]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \
} while (0)
#define T_BIG \

View File

@@ -7,6 +7,7 @@
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"
#if defined(__AES__)

View File

@@ -1,5 +1,6 @@
#include "keccak-gate.h"
#include "sph_keccak.h"
#include "algo/sha/sha256d.h"
int hard_coded_eb = 1;

View File

@@ -70,13 +70,13 @@ static const uint64_t RC[] = {
// Targeted macros; keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))

View File

@@ -16,7 +16,7 @@
typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
cube_4way_2buf_context cube;
skein256_8way_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
@@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_16way_ctx.keccak );
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_16way_ctx.skein );
#if defined(__VAES__)
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
#else
init_groestl256( &allium_16way_ctx.groestl, 32 );
#endif
return true;
}
@@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input )
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
@@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
cube_2way_context cube;
skein256_4way_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
@@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_8way_ctx.skein );
#if defined(__VAES__)
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
#else
init_groestl256( &allium_8way_ctx.groestl, 32 );
#endif
return true;
}
@@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, const void *input )
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
intrlv_2x128( vhashA, hash0, hash1, 256 );
intrlv_2x128( vhashB, hash2, hash3, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
dintrlv_2x128( hash0, hash1, vhashA, 256 );
dintrlv_2x128( hash2, hash3, vhashB, 256 );
intrlv_2x128( vhashA, hash4, hash5, 256 );
intrlv_2x128( vhashB, hash6, hash7, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
dintrlv_2x128( hash4, hash5, vhashA, 256 );
dintrlv_2x128( hash6, hash7, vhashB, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );

View File

@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror256_64( s1); \
s3 = mm512_shufll256_64( s3 ); \
s1 = mm512_shuflr256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_rol256_64( s3 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol256_64( s1 ); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_ror256_64( s3 );
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_ror_1x64( s1); \
s3 = mm256_shufll_64( s3 ); \
s1 = mm256_shuflr_64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rol_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rol_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_ror_1x64( s3 );
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror256_64( s2, s3 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_64( s6, s7 );
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -13,6 +13,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/ripemd/sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#define EPSa DBL_EPSILON
#define EPS1 DBL_EPSILON
@@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce )
}
typedef struct {
sph_sha256_context sha256;
sph_sha512_context sha512;
sha256_context sha256;
sph_sha512_context sha512;
sph_keccak512_context keccak;
sph_whirlpool_context whirlpool;
sph_haval256_5_context haval;
@@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx;
void init_m7m_ctx()
{
sph_sha256_init( &m7m_ctx );
sha256_ctx_init( &m7m_ctx.sha256 );
sph_sha512_init( &m7m_ctx.sha512 );
sph_keccak512_init( &m7m_ctx.keccak );
sph_whirlpool_init( &m7m_ctx.whirlpool );
@@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
sph_sha256_context ctxf_sha256;
memcpy(data, pdata, 80);
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
@@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha256_close( &ctx2.sha256, bhash[0] );
sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sha256_final( &ctx2.sha256, bhash[0] );
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha512_close( &ctx2.sha512, bhash[1] );
@@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, hash );
sha256_full( hash, bdata, bytes );
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
@@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
mpzscale=bytes;
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, hash );
}
sha256_full( hash, bdata, bytes );
}
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !opt_benchmark ) )

View File

@@ -7,24 +7,19 @@
#include <string.h>
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
void lbry_hash(void* output, const void* input)
{
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) hashA[16];
uint32_t _ALIGN(64) hashB[16];
uint32_t _ALIGN(64) hashC[16];
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, input, 112 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sha256_full( hashA, input, 112 );
sha256_full( hashA, hashA, 32 );
sph_sha512_init( &ctx_sha512 );
sph_sha512( &ctx_sha512, hashA, 32 );
@@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input)
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
sph_ripemd160_close( &ctx_ripemd, hashC );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashB, 20 );
sph_sha256( &ctx_sha256, hashC, 20 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sha256_ctx_init( &ctx_sha256 );
sha256_update( &ctx_sha256, hashB, 20 );
sha256_update( &ctx_sha256, hashC, 20 );
sha256_final( &ctx_sha256, hashA );
sha256_full( hashA, hashA, 32 );
memcpy( output, hashA, 32 );
}

View File

@@ -69,8 +69,12 @@ typedef unsigned int uint;
#define SCRYPT_HASH_BLOCK_SIZE 64U
#define SCRYPT_HASH_DIGEST_SIZE 32U
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
#define ROTL32(a,b) rol32(a,b)
#define ROTR32(a,b) ror32(a,b)
#define U8TO32_BE(p) \
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \

File diff suppressed because it is too large

View File

@@ -0,0 +1,70 @@
#ifndef SCRYPT_CORE_4WAY_H__
#define SCRYPT_CORE_4WAY_H__
#include "simd-utils.h"
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
// Serial SIMD over 4 way parallel
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
// 4 way parallel over serial SIMD
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
#endif
#if defined(__AVX2__)
void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
// 2 way parallel over SIMD128
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
// Double buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
// Triple buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
// Serial SIMD128 over 2 way parallel
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
// Double buffered serial SIMD over 2 way parallel
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
// Triple buffered 2 way
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
// Quadruple buffered
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
#endif
#if defined(__SSE2__)
// Parallel 4 way, 4x memory
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
// Linear SIMD 1 way, 1x memory, lowest memory usage
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
// Double buffered, 2x memory
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Triple buffered
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Quadruple buffered, 4x memory
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
#endif
// For reference only
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
#endif
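
The variant list above pairs each scrypt core with a scratch-memory multiplier. A minimal sizing sketch follows; it is an editor's illustration rather than part of this commit, and the 128 bytes per lane per iteration figure follows the reference implementation (scrypt_core_ref) further down.

// Bytes of scratch memory (V) needed for `lanes` buffered lanes at N
// iterations; uint32_t and size_t come from the headers included above.
static inline size_t scrypt_scratch_bytes( uint32_t N, unsigned lanes )
{
   return (size_t)N * 128 * lanes;
}

// e.g. scrypt_core_simd128()      -> scrypt_scratch_bytes( N, 1 )
//      scrypt_core_simd128_2buf() -> scrypt_scratch_bytes( N, 2 )
//      scrypt_core_4way()         -> scrypt_scratch_bytes( N, 4 )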

View File

@@ -0,0 +1,206 @@
#include "scrypt-core-ref.h"
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
{
uint32_t x0 = (B[ 0] ^= C[ 0]),
x1 = (B[ 1] ^= C[ 1]),
x2 = (B[ 2] ^= C[ 2]),
x3 = (B[ 3] ^= C[ 3]);
uint32_t x4 = (B[ 4] ^= C[ 4]),
x5 = (B[ 5] ^= C[ 5]),
x6 = (B[ 6] ^= C[ 6]),
x7 = (B[ 7] ^= C[ 7]);
uint32_t x8 = (B[ 8] ^= C[ 8]),
x9 = (B[ 9] ^= C[ 9]),
xa = (B[10] ^= C[10]),
xb = (B[11] ^= C[11]);
uint32_t xc = (B[12] ^= C[12]),
xd = (B[13] ^= C[13]),
xe = (B[14] ^= C[14]),
xf = (B[15] ^= C[15]);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
B[ 0] += x0;
B[ 1] += x1;
B[ 2] += x2;
B[ 3] += x3;
B[ 4] += x4;
B[ 5] += x5;
B[ 6] += x6;
B[ 7] += x7;
B[ 8] += x8;
B[ 9] += x9;
B[10] += xa;
B[11] += xb;
B[12] += xc;
B[13] += xd;
B[14] += xe;
B[15] += xf;
}
/**
* @param X input/output
* @param V scratch buffer
* @param N factor (def. 1024)
*/
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
{
for (uint32_t i = 0; i < N; i++) {
memcpy(&V[i * 32], X, 128);
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
for (uint32_t i = 0; i < N; i++) {
uint32_t j = 32 * (X[16] & (N - 1));
for (uint8_t k = 0; k < 32; k++)
X[k] ^= V[j + k];
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
}
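
A usage sketch for the reference core, an editor's illustration rather than part of this commit: X is a 128 byte in/out buffer (normally produced by PBKDF2-SHA256 of the block header) and V is a 128*N byte scratch area, as implied by the 128 byte memcpy per iteration above. malloc/free assume <stdlib.h>.

static int scrypt_core_ref_example( uint32_t X[32], uint32_t N )
{
   // N = 1024 is the default (LTC); larger N (scryptn2) only grows V.
   uint32_t *V = (uint32_t*) malloc( (size_t)N * 32 * sizeof(uint32_t) );
   if ( !V ) return -1;
   scrypt_core_ref( X, V, N );
   free( V );
   return 0;
}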

File diff suppressed because it is too large

View File

@@ -39,10 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
sph_sha256_context ctx;
sph_sha256_init( &ctx );
sph_sha256( &ctx, in, len );
sph_sha256_close( &ctx, digest );
sha256_context ctx;
sha256_ctx_init( &ctx );
sha256_update( &ctx, in, len );
sha256_final( &ctx, digest );
}
/**
@@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
void
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
{
unsigned char pad[64];
unsigned char pad[64] __attribute__ ((aligned (64)));
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
@@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sph_sha256_init( &ctx->ictx );
sph_sha256( &ctx->ictx, K, Klen );
sph_sha256_close( &ctx->ictx, khash );
sha256_ctx_init( &ctx->ictx );
sha256_update( &ctx->ictx, K, Klen );
sha256_final( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sph_sha256_init( &ctx->ictx );
sha256_ctx_init( &ctx->ictx );
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
memset( pad + Klen, 0x36, 64 - Klen );
sph_sha256( &ctx->ictx, pad, 64 );
sha256_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sph_sha256_init( &ctx->octx );
sha256_ctx_init( &ctx->octx );
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
memset( pad + Klen, 0x5c, 64 - Klen );
sph_sha256( &ctx->octx, pad, 64 );
sha256_update( &ctx->octx, pad, 64 );
}
/* Add bytes to the HMAC-SHA256 operation. */
@@ -102,18 +101,17 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
/* Feed data to the inner SHA256 operation. */
sph_sha256( &ctx->ictx, in, len );
sha256_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
{
unsigned char ihash[32];
sph_sha256_close( &ctx->ictx, ihash );
sph_sha256( &ctx->octx, ihash, 32 );
sph_sha256_close( &ctx->octx, digest );
uint32_t ihash[8] __attribute__ ((aligned (32)));
sha256_final( &ctx->ictx, ihash );
sha256_update( &ctx->octx, ihash, 32 );
sha256_final( &ctx->octx, digest );
}
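// Editor's sketch, not part of this commit: exercising the incremental API
// above. `key`, `msg` and the lengths are hypothetical caller data; the
// result matches the one-shot HMAC_SHA256_Buf() declared in the header.
static void hmac_sha256_example( const void *key, size_t keylen,
                                 const void *msg, size_t msglen,
                                 uint8_t mac[32] )
{
   HMAC_SHA256_CTX ctx;
   HMAC_SHA256_Init( &ctx, key, keylen );
   HMAC_SHA256_Update( &ctx, msg, msglen );
   HMAC_SHA256_Final( mac, &ctx );
}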
/**
@@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
{
HMAC_SHA256_CTX PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint64_t _ALIGN(128) T[4];
uint64_t _ALIGN(128) U[4];
// uint8_t _ALIGN(128) T[32];
// uint8_t _ALIGN(128) U[32];
uint32_t ivec;
size_t i, clen;
uint64_t j;
@@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
// _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
// _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );
// for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
// for ( k = 0; k < 32; k++ )
// T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */

View File

@@ -31,18 +31,18 @@
#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
#include "sha256-hash.h"
typedef struct HMAC_SHA256Context
{
sph_sha256_context ictx;
sph_sha256_context octx;
sha256_context ictx;
sha256_context octx;
} HMAC_SHA256_CTX;
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
size_t len, uint8_t digest[32] );

View File

@@ -59,7 +59,9 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
#endif // SSE2
@@ -79,8 +81,10 @@ void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
#endif // AVX2
@@ -99,7 +103,9 @@ void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in );

View File

@@ -180,6 +180,7 @@ static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
// this performs the entire hash all over again, why?
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -195,6 +196,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
hash[i] = swab32(hash[i]);
}
/*
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
@@ -241,6 +243,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len)
}
#endif
*/
static inline void sha256d_preextend(uint32_t *W)
{
@@ -653,6 +656,7 @@ int scanhash_sha256d( struct work *work,
return 0;
}
/*
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -682,13 +686,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
pdata[19] = n;
return 0;
}
*/
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
gate->hash = (void*)&sha256d;
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -7,9 +7,9 @@
#if defined(__SHA__)
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif

View File

@@ -74,17 +74,6 @@ static const uint32_t K256[64] =
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
/*
#define MAJs(X, Y, Z) \
_mm_or_si128( _mm_and_si128( X, Y ), \
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
*/
/*
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
_mm_xor_si128( Y, Z ) ) )
*/
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
@@ -105,38 +94,6 @@ static const uint32_t K256[64] =
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
__m128i T1 = mm128_ror_32( E, 14 ); \
__m128i T2 = mm128_ror_32( A, 9 ); \
__m128i T3 = _mm_xor_si128( F, G ); \
__m128i T4 = _mm_or_si128( A, B ); \
__m128i T5 = _mm_and_si128( A, B ); \
K = _mm_add_epi32( K, W[i] ); \
T1 = _mm_xor_si128( T1, E ); \
T2 = _mm_xor_si128( T2, A ); \
T3 = _mm_and_si128( T3, E ); \
T4 = _mm_and_si128( T4, C ); \
K = _mm_add_epi32( H, K ); \
T1 = mm128_ror_32( T1, 5 ); \
T2 = mm128_ror_32( T2, 11 ); \
T3 = _mm_xor_si128( T3, G ); \
T4 = _mm_or_si128( T4, T5 ); \
T1 = _mm_xor_si128( T1, E ); \
T2 = _mm_xor_si128( T2, A ); \
T1 = mm128_ror_32( T1, 6 ); \
T2 = mm128_ror_32( T2, 2 ); \
T1 = _mm_add_epi32( T1, T3 ); \
T2 = _mm_add_epi32( T2, T4 ); \
T1 = _mm_add_epi32( T1, K ); \
H = _mm_add_epi32( T1, T2 ); \
D = _mm_add_epi32( D, T1 ); \
} while (0)
*/
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
@@ -149,8 +106,8 @@ do { \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
// LE data, no need to byte swap
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
{
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
@@ -232,6 +189,91 @@ void sha256_4way_transform( __m128i *state_out, const __m128i *data,
state_out[7] = _mm_add_epi32( state_in[7], H );
}
// BE data, need to byte swap
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
{
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m128i W[16];
mm128_block_bswap_32( W, data );
mm128_block_bswap_32( W+8, data+8 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
Y_xor_Z = _mm_xor_si128( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
}
static void
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
{
@@ -436,61 +478,81 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
// SHA-256 8 way
#if defined(__AVX512VL__)
#define CHx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define BSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ) ), \
mm256_ror_32( x, 22 ) )
#define BSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ) ), \
mm256_ror_32( x, 25 ) )
#define SSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) )
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ) ), \
_mm256_srli_epi32( x, 3 ) )
#define SSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) )
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ) ), \
_mm256_srli_epi32( x, 10 ) )
#define SHA2x_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
// With AVX512VL, ternary logic optimizations are available.
// If not, optimize by forwarding the result of X^Y in MAJ to the next round
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
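// Editor's illustration, not part of this commit: the forwarding trick in
// scalar form. MAJ(x,y,z) == y ^ ((x^y) & (y^z)), and because the SHA-256
// round arguments rotate by one register, this round's x^y is exactly the
// next round's y^z and can be carried forward instead of recomputed.
static inline uint32_t maj_forwarded( uint32_t x, uint32_t y,
                                      uint32_t *y_xor_z )
{
   const uint32_t x_xor_y = x ^ y;
   const uint32_t maj = y ^ ( x_xor_y & *y_xor_z );
   *y_xor_z = x_xor_y;     // becomes y^z in the next round
   return maj;
}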
#if defined(__AVX512VL__)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
#else // AVX2
#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJx(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
_mm256_xor_si256( Y, Z ) ) )
/*
// Use saved X_xor_Y from previous round, now called Y_xor_Z,
// and save new X_xor_Y, for next round.
#define MAJx(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
*/
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
#endif // AVX512 else AVX2
#define SHA2x_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
/*
#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m256i T1, T2; \
@@ -498,16 +560,23 @@ do { \
T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
*/
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
#endif // AVX512VL else AVX2
// accepts LE byte ordered data, skip the byte swap
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[16];
memcpy_256( W, data, 16 );
A = state_in[0];
@@ -519,6 +588,101 @@ void sha256_8way_transform( __m256i *state_out, const __m256i *data,
G = state_in[6];
H = state_in[7];
#if !defined(__AVX512VL__)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm256_add_epi32( state_in[0], A );
state_out[1] = _mm256_add_epi32( state_in[1], B );
state_out[2] = _mm256_add_epi32( state_in[2], C );
state_out[3] = _mm256_add_epi32( state_in[3], D );
state_out[4] = _mm256_add_epi32( state_in[4], E );
state_out[5] = _mm256_add_epi32( state_in[5], F );
state_out[6] = _mm256_add_epi32( state_in[6], G );
state_out[7] = _mm256_add_epi32( state_in[7], H );
}
// Accepts BE byte ordered data, need to byte swap
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[16];
mm256_block_bswap_32( W , data );
mm256_block_bswap_32( W+8, data+8 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
#if !defined(__AVX512VL__)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -587,6 +751,9 @@ static void
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
{
register __m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[16];
mm256_block_bswap_32( W , in );
@@ -615,6 +782,10 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
H = m256_const1_64( 0x5BE0CD195BE0CD19 );
}
#if !defined(__AVX512VL__)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -790,27 +961,44 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
// SHA-256 16 way
#define CHx16(X, Y, Z) \
_mm512_ternarylogic_epi32( X, Y, Z, 0xca )
#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx16(X, Y, Z) \
_mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define BSG2_0x16(x) \
mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )
#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \
_mm512_ror_epi32( x, 13 ), \
_mm512_ror_epi32( x, 22 ) )
#define BSG2_1x16(x) \
mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )
#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \
_mm512_ror_epi32( x, 11 ), \
_mm512_ror_epi32( x, 25 ) )
#define SSG2_0x16(x) \
mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )
#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \
_mm512_ror_epi32( x, 18 ), \
_mm512_srli_epi32( x, 3 ) )
#define SSG2_1x16(x) \
mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )
#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \
_mm512_ror_epi32( x, 19 ), \
_mm512_srli_epi32( x, 10 ) )
#define SHA2x16_MEXP( a, b, c, d ) \
mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m512i T1 = BSG2_1x16( E ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
T1 = _mm512_add_epi32( T1, H ); \
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
T1 = _mm512_add_epi32( T1, T0 ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
/*
#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
@@ -821,14 +1009,10 @@ do { \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
*/
// Transform one 16 lane by 64 byte message block and update state.
// Calling function is responsible for initializing the state, setting
// correct byte order, counting bits and padding of the final block.
// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating
// redundant byte swapping.
//
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
// accepts LE input data
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
@@ -909,6 +1093,89 @@ void sha256_16way_transform( __m512i *state_out, const __m512i *data,
state_out[7] = _mm512_add_epi32( state_in[7], H );
}
// Accepts BE input data, need to bswap
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
mm512_block_bswap_32( W , data );
mm512_block_bswap_32( W+8, data+8 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm512_add_epi32( state_in[0], A );
state_out[1] = _mm512_add_epi32( state_in[1], B );
state_out[2] = _mm512_add_epi32( state_in[2], C );
state_out[3] = _mm512_add_epi32( state_in[3], D );
state_out[4] = _mm512_add_epi32( state_in[4], E );
state_out[5] = _mm512_add_epi32( state_in[5], F );
state_out[6] = _mm512_add_epi32( state_in[6], G );
state_out[7] = _mm512_add_epi32( state_in[7], H );
}
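// Editor's sketch, not part of this commit, of the caller contract noted
// above: build the fixed second block of a double SHA-256 (sha256d) for 16
// lanes and hash it with the no-swap (LE) transform, so no intermediate
// byte swapping is needed. `iv16` is assumed to hold the standard SHA-256
// initial state broadcast to all 16 lanes.
static void sha256d_16way_second_block( __m512i *out, const __m512i *hash1,
                                        const __m512i *iv16 )
{
   __m512i block[16];
   for ( int i = 0; i < 8; i++ ) block[i] = hash1[i];    // 32 byte digest
   block[ 8] = _mm512_set1_epi32( 0x80000000 );          // padding marker
   for ( int i = 9; i < 15; i++ ) block[i] = _mm512_setzero_si512();
   block[15] = _mm512_set1_epi32( 32*8 );                // bit length = 256
   sha256_16way_transform_le( out, block, iv16 );
}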
// Aggressive prehashing
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in )

View File

@@ -7,9 +7,9 @@
#if defined(__SHA__)
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
void sha256_opt_transform( uint32_t *state_out, const void *input,
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
@@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input,
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif

View File

@@ -1,18 +0,0 @@
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SHA__)
void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
#endif
#endif

142
algo/sha/sha256-hash.c Normal file
View File

@@ -0,0 +1,142 @@
#include "sha256-hash.h"
static const uint32_t SHA256_IV[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/*
static const uint8_t SHA256_PAD[64] =
{
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
void sha256_ctx_init( sha256_context *ctx )
{
memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV );
ctx->count = 0;
}
void sha256_update( sha256_context *ctx, const void *data, size_t len )
{
int ptr = ctx->count & 0x3f;
const uint8_t *src = data;
ctx->count += (uint64_t)len;
if ( len < 64 - ptr )
{
memcpy( ctx->buf + ptr, src, len );
return;
}
memcpy( ctx->buf + ptr, src, 64 - ptr );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
src += 64 - ptr;
len -= 64 - ptr;
while ( len >= 64 )
{
sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state );
src += 64;
len -= 64;
}
memcpy( ctx->buf, src, len );
}
#if 0
void sha256_final( sha256_context *ctx, uint32_t *hash )
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = ctx->count & 0x3f;
// r = ( ctx->count >> 3 ) & 0x3f;
//printf("final: count= %d, r= %d\n", ctx->count, r );
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if ( r < 56 )
{
/* Pad to 56 mod 64. */
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
}
else
{
/* Finish the current block and mix. */
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset( &ctx->buf[0], 0, 56 );
}
/* Add the terminating bit-count. */
ctx->buf[56] = bswap_64( ctx->count << 3 );
// ctx->buf[56] = bswap_64( ctx->count );
// be64enc( &ctx->buf[56], ctx->count );
/* Mix in the final block. */
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
/*
// be32enc_vect(digest, ctx->state, 4);
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
// Encode vector, two words at a time.
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
*/
}
#endif
void sha256_final( sha256_context *ctx, void *hash )
{
int ptr = ctx->count & 0x3f;
ctx->buf[ ptr++ ] = 0x80;
if ( ptr > 56 )
{
memset( ctx->buf + ptr, 0, 64 - ptr );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
memset( ctx->buf, 0, 56 );
}
else
memset( ctx->buf + ptr, 0, 56 - ptr );
*(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
for ( int i = 0; i < 8; i++ )
( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] );
}
void sha256_full( void *hash, const void *data, size_t len )
{
sha256_context ctx;
sha256_ctx_init( &ctx );
sha256_update( &ctx, data, len );
sha256_final( &ctx, hash );
}
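A hedged usage sketch of the new generic interface (one-shot vs incremental; the example function name is hypothetical):

#include <string.h>
#include "sha256-hash.h"

void hash_example( uint32_t digest[8], const void *data, size_t len )
{
   // one-shot
   sha256_full( digest, data, len );

   // incremental, same result
   sha256_context ctx;
   sha256_ctx_init( &ctx );
   sha256_update( &ctx, data, len );
   sha256_final( &ctx, digest );
}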

56
algo/sha/sha256-hash.h Normal file
View File

@@ -0,0 +1,56 @@
#ifndef SHA256_HASH_H__
#define SHA256_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha2.h"
// generic interface
typedef struct {
unsigned char buf[64]; /* first field, for alignment */
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__SHA__)
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#else
// without SHA...
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#endif
#endif
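On a little-endian host the _be transform can be viewed as the _le transform applied to byte-swapped message words. A reference sketch, illustrative only, with __builtin_bswap32 standing in for the project's byte-swap helpers:

static void sha256_transform_be_ref( uint32_t *state_out, const uint32_t *data,
                                     const uint32_t *state_in )
{
   uint32_t w[16];
   for ( int i = 0; i < 16; i++ )
      w[i] = __builtin_bswap32( data[i] );   // big-endian words -> host order
   sha256_transform_le( state_out, w, state_in );
}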

View File

@@ -14,6 +14,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -23,7 +24,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
@@ -45,27 +46,30 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_16way_transform( midstate, vdata, initstate );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_transform( hash32, block, midstate );
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -85,7 +89,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif
#if defined(SHA256D_8WAY)
@@ -128,7 +131,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
do
{
@@ -137,14 +140,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
sha256_8way_transform_le( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
@@ -209,7 +212,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
@@ -218,14 +221,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
sha256_4way_transform_le( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );

8
algo/sha/sha256d.c Normal file
View File

@@ -0,0 +1,8 @@
#include "sha256d.h"
void sha256d( void *hash, const void *data, int len )
{
sha256_full( hash, data, len );
sha256_full( hash, hash, 32 );
}

7
algo/sha/sha256d.h Normal file
View File

@@ -0,0 +1,7 @@
#include "algo-gate-api.h"
#include <string.h>
#include <inttypes.h>
#include "sha256-hash.h"
void sha256d( void *hash, const void *data, int len );
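A hypothetical usage sketch: sha256d is plain double SHA-256, applied to the 80-byte block header exactly as each lane of the vectorized scanhash routines above computes it.

#include "sha256d.h"

void hash_header_example( uint32_t hash[8], const uint32_t header[20] )
{
   sha256d( hash, header, 80 );   // nonce already set in header[19]
}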

View File

@@ -3,14 +3,14 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64)));
static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64)));
void sha256q_midstate( const void* input )
{
sph_sha256_init( &sha256q_ctx );
sph_sha256( &sha256q_ctx, input, 64 );
sha256_ctx_init( &sha256q_ctx );
sha256_update( &sha256q_ctx, input, 64 );
}
int sha256q_hash( void* output, const void* input )
@@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input )
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
sph_sha256_context ctx __attribute__ ((aligned (64)));
sha256_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
sph_sha256( &ctx, input + midlen, tail );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, output );
sha256_update( &ctx, input + midlen, tail );
sha256_final( &ctx, hash );
sha256_full( hash, hash, 32 );
sha256_full( hash, hash, 32 );
sha256_full( output, hash, 32 );
return 1;
}
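A sketch of the midstate reuse this enables (loop bounds and edata layout are hypothetical): the first 64 header bytes are compressed once per job by sha256q_midstate, and only the 16-byte tail holding the nonce is re-hashed per attempt.

void sha256q_scan_sketch( uint32_t edata[20], uint32_t first_nonce,
                          uint32_t max_nonce )
{
   uint32_t hash[8];
   sha256q_midstate( edata );         // one compression for bytes 0..63
   for ( uint32_t n = first_nonce; n < max_nonce; n++ )
   {
      edata[19] = n;                  // only the 16-byte tail changes
      sha256q_hash( hash, edata );    // resumes from the cached midstate
   }
}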

View File

@@ -47,7 +47,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform( midstate, vdata, initstate );
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
@@ -60,18 +60,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
// sha256_16way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
sha256_16way_transform_le( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy_512( block, hash32, 8 );
sha256_16way_transform( hash32, block, initstate );
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
@@ -137,7 +136,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
do
{
@@ -146,18 +145,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
sha256_8way_transform_le( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
sha256_8way_transform_le( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy_256( block, hash32, 8 );
sha256_8way_transform( hash32, block, initstate );
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
@@ -222,7 +221,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
@@ -231,18 +230,18 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
sha256_4way_transform_le( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
sha256_4way_transform_le( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy_128( block, hash32, 8 );
sha256_4way_transform( hash32, block, initstate );
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );

View File

@@ -4,120 +4,12 @@
#include <string.h>
#include <stdio.h>
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
#if defined(__SHA__)
// Only used on CPUs with SHA
/*
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
{
sph_sha256_init( &sha256t_ctx );
sph_sha256( &sha256t_ctx, input, 64 );
}
int sha256t_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) hash[16];
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
sph_sha256_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
sph_sha256( &ctx, input + midlen, tail );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, output );
return 1;
}
*/
/*
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block[16] __attribute__ ((aligned (64)));
uint32_t hash32[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
// uint32_t edata[20] __attribute__((aligned(64)));
// uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// mm128_bswap32_80( edata, pdata );
// sha256t_midstate( edata );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block, pdata + 16, 16 );
block[ 4] = 0x80000000;
memset( block + 5, 0, 40 );
block[15] = 80*8; // bit count
sha256_opt_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy( block, hash32, 32 );
block[ 8] = 0x80000000;
memset( block + 9, 0, 24 );
block[15] = 32*8; // bit count
sha256_opt_transform( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy( block, hash32, 32 );
sha256_opt_transform( hash32, block, initstate );
// byte swap final hash for testing
casti_m128i( hash32, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
casti_m128i( hash32, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
submit_solution( work, hash32, mythr );
n++;
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
sha256_opt_transform_le( midstate, pdata, initstate );
do
{
@@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
@@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =

View File

@@ -95,32 +95,36 @@ static const uint64_t K512[80] =
// SHA-512 8 way 64 bit
#define CH8W(X, Y, Z) \
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )
#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ8W(X, Y, Z) \
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define BSG8W_5_0(x) \
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )
#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \
_mm512_ror_epi64( x, 34 ), \
_mm512_ror_epi64( x, 39 ) )
#define BSG8W_5_1(x) \
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )
#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \
_mm512_ror_epi64( x, 18 ), \
_mm512_ror_epi64( x, 41 ) )
#define SSG8W_5_0(x) \
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )
#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \
_mm512_ror_epi64( x, 8 ), \
_mm512_srli_epi64( x, 7 ) )
#define SSG8W_5_1(x) \
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \
_mm512_ror_epi64( x, 61 ), \
_mm512_srli_epi64( x, 6 ) )
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
D = _mm512_add_epi64( D, T1 ); \
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
__m512i T1 = BSG8W_5_1( E ); \
__m512i T2 = BSG8W_5_0( A ); \
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
T1 = _mm512_add_epi64( T1, H ); \
T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \
T1 = _mm512_add_epi64( T1, T0 ); \
D = _mm512_add_epi64( D, T1 ); \
H = _mm512_add_epi64( T1, T2 ); \
} while (0)
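A scalar reference (sketch) of what SHA3_8WAY_STEP computes in each 64-bit lane; the caller rotates the A..H arguments between rounds, and the ternarylogic constants 0xca and 0xe8 correspond to the classic CH and MAJ expressions:

static inline uint64_t ror64_ref( uint64_t x, int n )
{  return ( x >> n ) | ( x << ( 64 - n ) );  }

static inline void sha512_step_ref( uint64_t A, uint64_t B, uint64_t C,
            uint64_t *D, uint64_t E, uint64_t F, uint64_t G, uint64_t *H,
            uint64_t Wi, uint64_t Ki )
{
   uint64_t bsg1 = ror64_ref( E, 14 ) ^ ror64_ref( E, 18 ) ^ ror64_ref( E, 41 );
   uint64_t bsg0 = ror64_ref( A, 28 ) ^ ror64_ref( A, 34 ) ^ ror64_ref( A, 39 );
   uint64_t ch   = ( E & F ) ^ ( ~E & G );              // CH8W, 0xca
   uint64_t maj  = ( A & B ) ^ ( A & C ) ^ ( B & C );   // MAJ8W, 0xe8
   uint64_t T1 = *H + bsg1 + ch + Ki + Wi;
   uint64_t T2 = bsg0 + maj;
   *D += T1;
   *H  = T1 + T2;
}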
@@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
/*
#define MAJ(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
*/
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
@@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
/*
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
#define BSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
*/
/*
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
return _mm256_add_epi64( w0a, w1a );
}
/*
#define SSG512x2_0( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-15], 1 ); \
X1a = mm256_ror_64( W[i-14], 1 ); \
X0b = mm256_ror_64( W[i-15], 8 ); \
X1b = mm256_ror_64( W[i-14], 8 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
#define SSG512x2_1( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-2],19 ); \
X1a = mm256_ror_64( W[i-1],19 ); \
X0b = mm256_ror_64( W[i-2],61 ); \
X1b = mm256_ror_64( W[i-1],61 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
__m256i T1 = mm256_ror_64( E, 23 ); \
__m256i T2 = mm256_ror_64( A, 5 ); \
__m256i T3 = _mm256_xor_si256( F, G ); \
__m256i T4 = _mm256_or_si256( A, B ); \
__m256i T5 = _mm256_and_si256( A, B ); \
K = _mm256_add_epi64( K, W[i] ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T3 = _mm256_and_si256( T3, E ); \
T4 = _mm256_and_si256( T4, C ); \
K = _mm256_add_epi64( H, K ); \
T1 = mm256_ror_64( T1, 4 ); \
T2 = mm256_ror_64( T2, 6 ); \
T3 = _mm256_xor_si256( T3, G ); \
T4 = _mm256_or_si256( T4, T5 ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T1 = mm256_ror_64( T1, 14 ); \
T2 = mm256_ror_64( T2, 28 ); \
T1 = _mm256_add_epi64( T1, T3 ); \
T2 = _mm256_add_epi64( T2, T4 ); \
T1 = _mm256_add_epi64( T1, K ); \
H = _mm256_add_epi64( T1, T2 ); \
D = _mm256_add_epi64( D, T1 ); \
} while (0)
*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
__m256i T1 = BSG5_1(E); \
__m256i T2 = BSG5_0(A); \
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
*/
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi64( D, T1 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
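A sketch of the MAJ identity exploited here: maj(X,Y,Z) == Y ^ ((X^Y) & (Y^Z)). Because the step arguments rotate every round, this round's X^Y is exactly next round's Y^Z, so caching it in Y_xor_Z saves one XOR per round (illustrative scalar form):

static inline uint64_t maj_cached( uint64_t X, uint64_t Y, uint64_t *Y_xor_Z )
{
   uint64_t X_xor_Y = X ^ Y;
   uint64_t m = Y ^ ( X_xor_Y & *Y_xor_Z );   // == (X&Y)^(X&Z)^(Y&Z)
   *Y_xor_Z = X_xor_Y;                        // carried into the next round
   return m;
}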
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{

View File

@@ -71,198 +71,6 @@ static const sph_u32 H256[8] = {
* of the compression function.
*/
#if defined(__SHA__)
#include "simd-utils.h"
static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
}
#else // no SHA
/*
static const sph_u32 K[64] = {
@@ -875,8 +683,24 @@ sha2_round(const unsigned char *data, sph_u32 r[8])
#undef SHA2_IN
}
#endif // SHA else
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) (data[x])
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) sph_dec32be_aligned( data+(x) )
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
/* see sph_sha2.h */
void

View File

@@ -207,6 +207,13 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
void sph_sha256_full( void *dst, const void *data, size_t len );
// These shouldn't be called directly, use sha256-hash.h generic functions
// sha256_transform_le & sha256_transform_be instead.
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64

View File

@@ -20,8 +20,8 @@ static const uint32_t IV512[] =
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror128_32( a ), \
mm256_ror128_32( b ), 0x88 )
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
mm256_shuflr128_32( b ), 0x88 )
#if defined(__VAES__)
@@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
@@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror128_32(
k00 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror128_32(
k01 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror128_32(
k02 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror128_32(
k03 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror128_32(
k10 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror128_32(
k11 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror128_32(
k12 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror128_32(
k13 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror128_32(
k00 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror128_32(
k01 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror128_32(
k02 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror128_32(
k03 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror128_32(
k10 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror128_32(
k11 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror128_32(
k13 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

View File

@@ -12,8 +12,8 @@ static const uint32_t IV512[] =
};
#define mm512_ror2x512hi_1x32( a, b ) \
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
mm512_ror128_32( b ) )
_mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
mm512_shuflr128_32( b ) )
static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
@@ -60,7 +60,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
{
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 )
@@ -69,33 +69,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1,
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K2,
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( K3,
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( K4,
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K5,
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K6,
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
@@ -130,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_ror128_32(
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( mm512_ror128_32(
K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( mm512_ror128_32(
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
@@ -187,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 13
K0 = _mm512_xor_si512( mm512_ror128_32(
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7= _mm512_xor_si512( mm512_ror128_32(
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

View File

@@ -74,15 +74,15 @@ static const sph_u32 IV512[] = {
#endif
/*
#if defined(__AVX2__)
// 2 way version of above
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
mm256_rol256_3x32( b ), 0x88 )
#endif
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
@@ -135,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -144,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -153,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -222,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 3, 7, 11
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -295,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 13
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#if defined (SKEIN_8WAY)
@@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input )
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
sph_sha256_context ctx_sha256;
#else
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx_sha256;
@@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input )
#if defined(__SHA__)
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash0, 64 );
sph_sha256_close( &ctx_sha256, hash0 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash1, 64 );
sph_sha256_close( &ctx_sha256, hash1 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash2, 64 );
sph_sha256_close( &ctx_sha256, hash2 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash3, 64 );
sph_sha256_close( &ctx_sha256, hash3 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
sha256_full( hash2, hash2, 64 );
sha256_full( hash3, hash3, 64 );
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else

View File

@@ -5,21 +5,18 @@
#include <string.h>
#include <stdint.h>
#include "sph_skein.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
void skeinhash(void *state, const void *input)
{
uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
sph_sha256_context ctx_sha256;
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
sha256_full( hash, hash, 64 );
memcpy(state, hash, 32);
}
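// Illustrative sketch (assumed equivalence, not part of this diff): the
// sha256_full() calls above are taken to be a one-shot form of the removed
// sph_sha256_init/sph_sha256/sph_sha256_close sequence, using the
// sha256_context API that appears elsewhere in this commit.
static inline void sha256_full_sketch( void *hash, const void *data, size_t len )
{
   sha256_context ctx;
   sha256_ctx_init( &ctx );            // was: sph_sha256_init
   sha256_update( &ctx, data, len );   // was: sph_sha256
   sha256_final( &ctx, hash );         // was: sph_sha256_close
}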
@@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input)
int scanhash_skein( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
@@ -36,7 +33,7 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);

View File

@@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p )
*/
}
#endif
static inline uint32_t rotl32( uint32_t a, size_t r )
{
return ( a << r ) | ( a >> (32-r) );
}
// Vectorized and targetted version of fnv1a
#if defined (__AVX2__)
@@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \

View File

@@ -1,5 +1,5 @@
#include "algo-gate-api.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#include "Verthash.h"
#include "tiny_sha3/sha3-4way.h"
@@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate )
uint8_t vhDataFileHash[32] = { 0 };
applog( LOG_NOTICE, "Verifying Verthash data" );
sph_sha256_full( vhDataFileHash, verthashInfo.data,
sha256_full( vhDataFileHash, verthashInfo.data,
verthashInfo.dataSize );
if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes,
sizeof(verthashDatFileHash_bytes) ) == 0 )

View File

@@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce,
be32enc(&endiandata[19], n );
whirlpool_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
if (vhash[7] <= Htarg && fulltest(vhash, ptarget) && !opt_benchmark )
submit_solution( work, vhash, mythr );
} while ( n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata )
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_update_close( &ctx.luffa, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_update_close( &ctx.luffa, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_update_close( &ctx.luffa, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_update_close( &ctx.luffa, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
}
else
{
@@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)in0 + 64, 16 );
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_update_close( &ctx.cube, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)in1 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)in2 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)in3 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash4,
(const byte*)in4 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash5,
(const byte*)in5 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash6,
(const byte*)in6 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash7,
(const byte*)in7 + 64, 16 );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_update_close( &ctx.cube, vhash,
vhash + (16<<2), 16 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
}
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash4,
(const byte*)in4, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash5,
(const byte*)in5, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash6,
(const byte*)in6, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash7,
(const byte*)in7, size );
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
}
break;
case SHAVITE:
@@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
intrlv_2x128( vdata2, edata, edata, 640 );
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
intrlv_2x128( vhash, hash0, hash1, 640 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, hash2, hash3, 640 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
dintrlv_2x128_512( hash2, hash3, vhash );
intrlv_2x128( vhash, hash0, hash1, 640 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, hash2, hash3, 640 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
dintrlv_2x128_512( hash2, hash3, vhash );
}
else
{
@@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)in0 + 64, 16 );
intrlv_2x128( vhash, in0, in1, size<<3 );
cube_2way_update_close( &ctx.cube, vhash,
vhash + (16<<1), 16 );
dintrlv_2x128_512( hash0, hash1, vhash );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3 + 64, 16 );
intrlv_2x128( vhash, in2, in3, size<<3 );
cube_2way_update_close( &ctx.cube, vhash,
vhash + (16<<1), 16 );
dintrlv_2x128_512( hash2, hash3, vhash );
}
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
intrlv_2x128( vhash, in0, in1, size<<3 );
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
}
break;
case SHAVITE:

View File

@@ -1,4 +1,5 @@
#include "x16r-gate.h"
#include "algo/sha/sha256d.h"
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };

View File

@@ -37,6 +37,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -115,7 +116,7 @@ union _x16r_8way_context_overlay
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hashState_fugue fugue;
@@ -164,8 +165,8 @@ union _x16r_4way_context_overlay
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;

View File

@@ -13,7 +13,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if defined(__SHA__)
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#endif
#if defined (X21S_8WAY)
@@ -208,9 +208,7 @@ union _x21s_4way_context_overlay
haval256_5_4way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(__SHA__)
sph_sha256_context sha256;
#else
#if !defined(__SHA__)
sha256_4way_context sha256;
#endif
} __attribute__ ((aligned (64)));
@@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
#if defined(__SHA__)
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash0, 64 );
sph_sha256_close( &ctx.sha256, output );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash1, 64 );
sph_sha256_close( &ctx.sha256, output+32 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash2, 64 );
sph_sha256_close( &ctx.sha256, output+64 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash3, 64 );
sph_sha256_close( &ctx.sha256, output+96 );
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
sha256_full( output+64, hash2, 64 );
sha256_full( output+96, hash3, 64 );
#else

View File

@@ -8,7 +8,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
@@ -23,7 +23,7 @@ union _x21s_context_overlay
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sph_sha256_context sha256;
sha256_context sha256;
};
typedef union _x21s_context_overlay x21s_context_overlay;
@@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid )
sph_gost512 ( &ctx.gost, (const void*) hash, 64 );
sph_gost512_close( &ctx.gost, (void*) hash );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash, 64 );
sph_sha256_close( &ctx.sha256, hash );
sha256_full( hash, hash, 64 );
memcpy( output, hash, 32 );

View File

@@ -37,7 +37,8 @@ union _x17_8way_context_overlay
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
// cube_4way_context cube;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
#if defined(__VAES__)

View File

@@ -28,7 +28,7 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#endif
#if defined(X22I_8WAY)
@@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay
haval256_5_8way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X22I_8WAY_SHA)
sph_sha256_context sha256;
#else
#if !defined(X22I_8WAY_SHA)
sha256_8way_context sha256;
#endif
#if defined(__VAES__)
@@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
#if defined(X22I_8WAY_SHA)
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash0, 64 );
sph_sha256_close( &ctx.sha256, output );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash1, 64 );
sph_sha256_close( &ctx.sha256, output+32 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash2, 64 );
sph_sha256_close( &ctx.sha256, output+64 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash3, 64 );
sph_sha256_close( &ctx.sha256, output+96 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash4, 64 );
sph_sha256_close( &ctx.sha256, output+128 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash5, 64 );
sph_sha256_close( &ctx.sha256, output+160 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash6, 64 );
sph_sha256_close( &ctx.sha256, output+192 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash7, 64 );
sph_sha256_close( &ctx.sha256, output+224 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
sha256_full( hash2, hash2, 64 );
sha256_full( hash3, hash3, 64 );
sha256_full( hash4, hash4, 64 );
sha256_full( hash5, hash5, 64 );
sha256_full( hash6, hash6, 64 );
sha256_full( hash7, hash7, 64 );
#else
@@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay
haval256_5_4way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X22I_4WAY_SHA)
sph_sha256_context sha256;
#else
#if !defined(X22I_4WAY_SHA)
sha256_4way_context sha256;
#endif
};
@@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
#if defined(X22I_4WAY_SHA)
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash0, 64 );
sph_sha256_close( &ctx.sha256, output );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash1, 64 );
sph_sha256_close( &ctx.sha256, output+32 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash2, 64 );
sph_sha256_close( &ctx.sha256, output+64 );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash3, 64 );
sph_sha256_close( &ctx.sha256, output+96 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
sha256_full( hash2, hash2, 64 );
sha256_full( hash3, hash3, 64 );
#else

View File

@@ -24,6 +24,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
@@ -57,7 +58,6 @@ union _x22i_context_overlay
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sph_sha256_context sha256;
};
typedef union _x22i_context_overlay x22i_context_overlay;
@@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_gost512 (&ctx.gost, (const void*) hash, 64);
sph_gost512_close(&ctx.gost, (void*) hash);
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash, 64 );
sph_sha256_close( &ctx.sha256, hash );
sha256_full( hash, hash, 64 );
memcpy(output, hash, 32);

View File

@@ -33,7 +33,7 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#endif
void x25x_shuffle( void *hash )
@@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_8WAY_SHA)
sph_sha256_context sha256;
sha256_context sha256;
#else
sha256_8way_context sha256;
#endif
@@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
#if defined(X25X_8WAY_SHA)
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash0[20], 64 );
sph_sha256_close( &ctx.sha256, hash0[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash1[20], 64 );
sph_sha256_close( &ctx.sha256, hash1[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash2[20], 64 );
sph_sha256_close( &ctx.sha256, hash2[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash3[20], 64 );
sph_sha256_close( &ctx.sha256, hash3[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash4[20], 64 );
sph_sha256_close( &ctx.sha256, hash4[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash5[20], 64 );
sph_sha256_close( &ctx.sha256, hash5[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash6[20], 64 );
sph_sha256_close( &ctx.sha256, hash6[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash7[20], 64 );
sph_sha256_close( &ctx.sha256, hash7[21] );
sha256_full( hash0[21], hash0[20], 64 );
sha256_full( hash1[21], hash1[20], 64 );
sha256_full( hash2[21], hash2[20], 64 );
sha256_full( hash3[21], hash3[20], 64 );
sha256_full( hash4[21], hash4[20], 64 );
sha256_full( hash5[21], hash5[20], 64 );
sha256_full( hash6[21], hash6[20], 64 );
sha256_full( hash7[21], hash7[20], 64 );
intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21],
hash4[21], hash5[21], hash6[21], hash7[21] );
@@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_4WAY_SHA)
sph_sha256_context sha256;
sha256_context sha256;
#else
sha256_4way_context sha256;
#endif
@@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
#if defined(X25X_4WAY_SHA)
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash0[20], 64 );
sph_sha256_close( &ctx.sha256, hash0[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash1[20], 64 );
sph_sha256_close( &ctx.sha256, hash1[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash2[20], 64 );
sph_sha256_close( &ctx.sha256, hash2[21] );
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, hash3[20], 64 );
sph_sha256_close( &ctx.sha256, hash3[21] );
sha256_full( hash0[21], hash0[20], 64 );
sha256_full( hash1[21], hash1[20], 64 );
sha256_full( hash2[21], hash2[20], 64 );
sha256_full( hash3[21], hash3[20], 64 );
intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] );

View File

@@ -23,7 +23,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
@@ -60,7 +60,7 @@ union _x25x_context_overlay
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sph_sha256_context sha256;
sha256_context sha256;
sph_panama_context panama;
blake2s_state blake2s;
};
@@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_gost512 (&ctx.gost, (const void*) &hash[19], 64);
sph_gost512_close(&ctx.gost, (void*) &hash[20]);
sph_sha256_init( &ctx.sha256 );
sph_sha256( &ctx.sha256, &hash[20], 64 );
sph_sha256_close( &ctx.sha256, &hash[21] );
sha256_full( &hash[21], &hash[20], 64 );
sph_panama_init(&ctx.panama);
sph_panama (&ctx.panama, (const void*) &hash[21], 64 );

View File

@@ -35,9 +35,11 @@
#include "blake2b-yp.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
//#ifndef ROTR64
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
//#endif
#define ROTR64(x, y) ror64( x, y )
// Little-endian byte access.
#define B2B_GET64(p) \

View File

@@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
endiandata[19] = n;
// do sha256 prehash
sph_sha256_init( &sha256_prehash_ctx );
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
sha256_ctx_init( &sha256_prehash_ctx );
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
do {
yespower_tls( (unsigned char *)endiandata, params.perslen,

View File

@@ -27,14 +27,11 @@
* coin.
*/
#include "yespower.h"
#include "algo-gate-api.h"
yespower_params_t yespower_params;
//SHA256_CTX sha256_prehash_ctx;
__thread sph_sha256_context sha256_prehash_ctx;
//__thread SHA256_CTX sha256_prehash_ctx;
__thread sha256_context sha256_prehash_ctx;
// YESPOWER
@@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
endiandata[19] = n;
// do sha256 prehash
sph_sha256_init( &sha256_prehash_ctx );
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
sha256_ctx_init( &sha256_prehash_ctx );
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
do {
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
@@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce,
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
// do sha256 prehash
sph_sha256_init( &sha256_prehash_ctx );
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
do {
if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) )
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )

View File

@@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
ARX(X0, X3, X2, 18) \
/* Rearrange data */ \
X1 = _mm_shuffle_epi32(X1, 0x93); \
X3 = _mm_shuffle_epi32(X3, 0x39); \
X2 = _mm_shuffle_epi32(X2, 0x4E); \
X3 = _mm_shuffle_epi32(X3, 0x39); \
/* Operate on "rows" */ \
ARX(X3, X0, X1, 7) \
ARX(X2, X3, X0, 9) \
ARX(X1, X2, X3, 13) \
ARX(X0, X1, X2, 18) \
/* Rearrange data */ \
X3 = _mm_shuffle_epi32(X3, 0x93); \
X1 = _mm_shuffle_epi32(X1, 0x39); \
X2 = _mm_shuffle_epi32(X2, 0x4E); \
X3 = _mm_shuffle_epi32(X3, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
/**
* Apply the Salsa20 core to the block provided in (X0 ... X3).
@@ -1095,7 +1095,7 @@ int yespower(yespower_local_t *local,
salsa20_blk_t *V, *XY;
pwxform_ctx_t ctx;
uint8_t sha256[32];
sph_sha256_context sha256_ctx;
sha256_context sha256_ctx;
/* Sanity-check parameters */
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0)
@@ -1138,10 +1138,9 @@ int yespower(yespower_local_t *local,
// copy prehash, do tail
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sph_sha256( &sha256_ctx, src+64, srclen-64 );
sph_sha256_close( &sha256_ctx, sha256 );
sha256_update( &sha256_ctx, src+64, srclen-64 );
sha256_final( &sha256_ctx, sha256 );
if ( version == YESPOWER_0_5 )
{
PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size );
@@ -1186,7 +1185,9 @@ int yespower(yespower_local_t *local,
if ( work_restart[thrid].restart ) return 0;
smix_1_0( B, r, N, V, XY, &ctx );
if ( work_restart[thrid].restart ) return 0;
HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256),
(uint8_t *)dst );
}
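// Illustrative sketch of the prehash/tail split used above (names taken from
// this diff): the first 64 bytes of the 80-byte header are nonce-independent,
// so they are hashed once per job and only the tail is hashed per nonce.
static void prehash_header_sketch( const void *endiandata )
{
   sha256_ctx_init( &sha256_prehash_ctx );               // per-thread, per job
   sha256_update( &sha256_prehash_ctx, endiandata, 64 );
}
static void finish_header_sketch( const uint8_t *src, size_t srclen,
                                  uint8_t digest[32] )
{
   sha256_context ctx;
   memcpy( &ctx, &sha256_prehash_ctx, sizeof ctx );      // resume saved midstate
   sha256_update( &ctx, src + 64, srclen - 64 );         // hash only the tail
   sha256_final( &ctx, digest );
}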

View File

@@ -34,7 +34,7 @@
#include <stdlib.h> /* for size_t */
#include "miner.h"
#include "simd-utils.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#ifdef __cplusplus
extern "C" {
@@ -78,9 +78,7 @@ typedef struct {
extern yespower_params_t yespower_params;
//SHA256_CTX sha256_prehash_ctx;
extern __thread sph_sha256_context sha256_prehash_ctx;
//extern __thread SHA256_CTX sha256_prehash_ctx;
extern __thread sha256_context sha256_prehash_ctx;
/**
* yespower_init_local(local):

View File

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
# Icelake AVX512 SHA VAES
make distclean || echo clean

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.17.1'
PACKAGE_STRING='cpuminer-opt 3.17.1'
PACKAGE_VERSION='3.18.0'
PACKAGE_STRING='cpuminer-opt 3.18.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.17.1
cpuminer-opt configure 3.18.0
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.17.1, which was
It was created by cpuminer-opt $as_me 3.18.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.17.1'
VERSION='3.18.0'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.17.1, which was
This file was extended by cpuminer-opt $as_me 3.18.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.17.1
cpuminer-opt config.status 3.18.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.17.1])
AC_INIT([cpuminer-opt], [3.18.0])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -38,6 +38,7 @@
#include <jansson.h>
#include <openssl/sha.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
#ifdef WIN32
#include <winsock2.h>
@@ -94,6 +95,7 @@ bool have_gbt = true;
bool allow_getwork = true;
bool want_stratum = true; // pretty useless
bool have_stratum = false;
bool stratum_down = true;
bool allow_mininginfo = true;
bool use_syslog = false;
bool use_colors = true;
@@ -166,6 +168,8 @@ uint32_t stale_share_count = 0;
uint32_t solved_block_count = 0;
double *thr_hashrates;
double global_hashrate = 0.;
double total_hashes = 0.;
struct timeval total_hashes_time = {0,0};
double stratum_diff = 0.;
double net_diff = 0.;
double net_hashrate = 0.;
@@ -1001,6 +1005,7 @@ struct share_stats_t
double share_diff;
double stratum_diff;
double target_diff;
uint32_t height;
char job_id[32];
};
@@ -1080,13 +1085,14 @@ void report_summary_log( bool force )
pthread_mutex_unlock( &stats_lock );
timeval_subtract( &et, &now, &start_time );
timeval_subtract( &uptime, &now, &session_start );
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
double ghrate = global_hashrate;
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
double target_diff = exp32 * last_targetdiff;
double shrate = safe_div( target_diff * (double)(accepts),
share_time, 0. );
// global_hashrate = ghrate;
double sess_hrate = safe_div( exp32 * norm_diff_sum,
(double)uptime.tv_sec, 0. );
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1134,29 +1140,38 @@ void report_summary_log( bool force )
100. * safe_div( (double)accepted_share_count,
(double)submitted_share_count, 0. ) );
if ( stale_share_count )
applog2( LOG_INFO, "Stale %7d %7d %5.1f%%",
{
int prio = stales ? LOG_MINR : LOG_INFO;
applog2( prio, "Stale %7d %7d %5.1f%%",
stales, stale_share_count,
100. * safe_div( (double)stale_share_count,
(double)submitted_share_count, 0. ) );
}
if ( rejected_share_count )
applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%",
{
int prio = rejects ? LOG_ERR : LOG_INFO;
applog2( prio, "Rejected %7d %7d %5.1f%%",
rejects, rejected_share_count,
100. * safe_div( (double)rejected_share_count,
(double)submitted_share_count, 0. ) );
}
if ( solved_block_count )
applog2( LOG_INFO,"Blocks Solved %7d %7d",
{
int prio = solved ? LOG_PINK : LOG_INFO;
applog2( prio, "Blocks Solved %7d %7d",
solved, solved_block_count );
}
applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g",
highest_share, lowest_share );
highest_share, lowest_share );
int mismatch = submitted_share_count
- ( accepted_share_count + stale_share_count + rejected_share_count );
if ( mismatch )
{
if ( mismatch != 1 )
applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch );
else
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N );
}
}
@@ -1278,17 +1293,17 @@ static int share_result( int result, struct work *work,
if ( use_colors )
{
bcol = acol = scol = rcol = CL_WHT;
bcol = acol = scol = rcol = CL_N;
if ( likely( result ) )
{
acol = CL_WHT CL_GRN;
if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG;
acol = CL_LGR;
if ( unlikely( solved ) ) bcol = CL_LMA;
}
else if ( stale ) scol = CL_WHT CL_YL2;
else rcol = CL_WHT CL_RED;
else if ( stale ) scol = CL_YL2;
else rcol = CL_LRD;
}
applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
bres, share_time, latency );
@@ -1296,8 +1311,7 @@ static int share_result( int result, struct work *work,
{
if ( have_stratum )
applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s",
my_stats.share_diff, stratum.block_height,
my_stats.job_id );
my_stats.share_diff, my_stats.height, my_stats.job_id );
else
applog2( LOG_INFO, "Diff %.5g, Block %d",
my_stats.share_diff, work ? work->height : last_block_height );
@@ -1308,7 +1322,7 @@ static int share_result( int result, struct work *work,
uint32_t str[8];
uint32_t *targ;
if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason );
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
diff_to_hash( str, my_stats.share_diff );
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
@@ -1861,6 +1875,7 @@ static void update_submit_stats( struct work *work, const void *hash )
share_stats[ s_put_ptr ].net_diff = net_diff;
share_stats[ s_put_ptr ].stratum_diff = stratum_diff;
share_stats[ s_put_ptr ].target_diff = work->targetdiff;
share_stats[ s_put_ptr ].height = work->height;
if ( have_stratum )
strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 );
s_put_ptr = stats_ptr_incr( s_put_ptr );
@@ -1871,6 +1886,10 @@ static void update_submit_stats( struct work *work, const void *hash )
bool submit_solution( struct work *work, const void *hash,
struct thr_info *thr )
{
// Job went stale during hashing of a valid share.
if ( !opt_quiet && work_restart[ thr->id ].restart )
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
work->sharediff = hash_to_diff( hash );
if ( likely( submit_work( thr, work ) ) )
{
@@ -1887,11 +1906,11 @@ bool submit_solution( struct work *work, const void *hash,
if ( !opt_quiet )
{
if ( have_stratum )
applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s",
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
submitted_share_count, work->sharediff, work->height,
work->job_id );
else
applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
submitted_share_count, work->sharediff, work->height,
work->data[ algo_gate.ntime_index ] );
}
@@ -2048,7 +2067,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_rwlock_wrlock( &g_work_lock );
pthread_mutex_lock( &sctx->work_lock );
new_job = sctx->new_job;
new_job = sctx->new_job; // otherwise just increment extranonce2
sctx->new_job = false;
free( g_work->job_id );
@@ -2084,6 +2103,14 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( !opt_quiet )
{
int mismatch = submitted_share_count
- ( accepted_share_count + stale_share_count + rejected_share_count );
if ( mismatch )
applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count );
}
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
@@ -2264,19 +2291,29 @@ static void *miner_thread( void *userdata )
}
// wait for stratum to send first job
if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1);
if ( have_stratum ) while ( unlikely( stratum_down ) )
{
if ( opt_debug )
applog( LOG_INFO, "Thread %d waiting for first job", thr_id );
sleep(1);
}
// nominal starting values
int64_t max64 = 20;
thr_hashrates[thr_id] = 20;
while (1)
{
uint64_t hashes_done;
struct timeval tv_start, tv_end, diff;
int64_t max64 = 1000;
// int64_t max64 = 1000;
int nonce_found = 0;
if ( likely( algo_gate.do_this_thread( thr_id ) ) )
{
if ( have_stratum )
if ( have_stratum )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( *nonceptr >= end_nonce )
stratum_gen_work( &stratum, &g_work );
}
@@ -2383,6 +2420,8 @@ static void *miner_thread( void *userdata )
if ( diff.tv_usec || diff.tv_sec )
{
pthread_mutex_lock( &stats_lock );
total_hashes += hashes_done;
total_hashes_time = tv_end;
thr_hashrates[thr_id] =
hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 );
pthread_mutex_unlock( &stats_lock );
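// Illustrative sketch (matching the report_summary_log() change earlier in
// this commit): the session mean hashrate is derived from the accumulators
// updated above, total_hashes and total_hashes_time, over the whole session.
static double session_mean_hashrate_sketch( void )
{
   struct timeval uptime;
   timeval_subtract( &uptime, &total_hashes_time, &session_start );
   return safe_div( total_hashes, (double)uptime.tv_sec, 0. );
}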
@@ -2439,7 +2478,6 @@ static void *miner_thread( void *userdata )
&& thr_id == opt_n_threads - 1 ) )
{
double hashrate = 0.;
pthread_mutex_lock( &stats_lock );
for ( i = 0; i < opt_n_threads; i++ )
hashrate += thr_hashrates[i];
@@ -2448,8 +2486,12 @@ static void *miner_thread( void *userdata )
if ( opt_benchmark )
{
struct timeval uptime;
char hr[16];
char hr_units[2] = {0,0};
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
@@ -2745,6 +2787,7 @@ static void *stratum_thread(void *userdata )
if ( unlikely( stratum_need_reset ) )
{
stratum_need_reset = false;
stratum_down = true;
stratum_disconnect( &stratum );
if ( strcmp( stratum.url, rpc_url ) )
{
@@ -2755,11 +2798,13 @@ static void *stratum_thread(void *userdata )
else
applog(LOG_WARNING, "Stratum connection reset");
// reset stats queue as well
restart_threads();
if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
}
while ( !stratum.curl )
{
stratum_down = true;
pthread_rwlock_wrlock( &g_work_lock );
g_work_time = 0;
pthread_rwlock_unlock( &g_work_lock );
@@ -2780,6 +2825,7 @@ static void *stratum_thread(void *userdata )
}
else
{
stratum_down = false;
restart_threads();
applog(LOG_BLUE,"Stratum connection established" );
}
@@ -2801,7 +2847,7 @@ static void *stratum_thread(void *userdata )
}
else
{
applog(LOG_WARNING, "Stratum connection interrupted");
// applog(LOG_WARNING, "Stratum connection interrupted");
// stratum_disconnect( &stratum );
stratum_need_reset = true;
}
@@ -3629,6 +3675,10 @@ int main(int argc, char *argv[])
show_usage_and_exit(1);
}
// Need to register the algo gate to get optimizations for the CPU's
// capabilities, but that causes register logs to appear before the CPU
// capabilities are output. Registration would need to be split into 2 parts:
// the first sets algo optimizations without logging, the second does the logging.
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
if ( !check_cpu_capability() ) exit(1);
@@ -3685,12 +3735,6 @@ int main(int argc, char *argv[])
}
}
// Initialize stats times and counters
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
// if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init( &stats_lock, NULL );
@@ -3854,7 +3898,8 @@ int main(int argc, char *argv[])
return 1;
}
}
if ( have_stratum )
if ( have_stratum )
{
if ( opt_debug )
applog(LOG_INFO,"Creating stratum thread");
@@ -3900,24 +3945,35 @@ int main(int argc, char *argv[])
opt_api_listen );
}
// hold the stats lock while starting miner threads
pthread_mutex_lock( &stats_lock );
/* start mining threads */
for (i = 0; i < opt_n_threads; i++)
for ( i = 0; i < opt_n_threads; i++ )
{
usleep( 5000 );
// usleep( 5000 );
thr = &thr_info[i];
thr->id = i;
thr->q = tq_new();
if (!thr->q)
if ( !thr->q )
return 1;
err = thread_create(thr, miner_thread);
if (err) {
applog(LOG_ERR, "Miner thread %d create failed", i);
err = thread_create( thr, miner_thread );
if ( err )
{
applog( LOG_ERR, "Miner thread %d create failed", i );
return 1;
}
}
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
opt_n_threads, num_cpus, algo_names[opt_algo] );
// Initialize stats times and counters
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
pthread_mutex_unlock( &stats_lock );
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
opt_n_threads, num_cpus, algo_names[opt_algo] );
/* main loop - simply wait for workio thread to exit */
pthread_join( thr_info[work_thr_id].pth, NULL );

miner.h
View File

@@ -70,17 +70,25 @@ void *alloca (size_t);
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#define LOG_BLUE 0x10 /* unique value */
#define LOG_BLUE 0x10 /* unique value */
#define LOG_MAJR 0x11 /* unique value */
#define LOG_MINR 0x12 /* unique value */
#define LOG_GREEN 0x13 /* unique value */
#define LOG_PINK 0x14 /* unique value */
#else
enum {
LOG_ERR,
LOG_CRIT,
LOG_ERR,
LOG_WARNING,
LOG_NOTICE,
LOG_INFO,
LOG_DEBUG,
/* custom notices */
LOG_BLUE = 0x10,
};
/* custom notices */
LOG_BLUE = 0x10,
LOG_MAJR = 0x11,
LOG_MINR = 0x12,
LOG_GREEN = 0x13,
LOG_PINK = 0x14 };
#endif
extern bool is_power_of_2( int n );
@@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err);
void sha256_init(uint32_t *state);
void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
void sha256d(unsigned char *hash, const unsigned char *data, int len);
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
@@ -225,7 +233,8 @@ int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
//#if defined(__x86_64__) && defined(USE_AVX2)
#if defined(__x86_64__) && defined(__AVX2__)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
@@ -271,9 +280,9 @@ struct thr_api {
#define CL_N "\x1B[0m"
#define CL_RED "\x1B[31m"
#define CL_GRN "\x1B[32m"
#define CL_YLW "\x1B[33m"
#define CL_YLW "\x1B[33m" // dark yellow
#define CL_BLU "\x1B[34m"
#define CL_MAG "\x1B[35m"
#define CL_MAG "\x1B[35m" // purple
#define CL_CYN "\x1B[36m"
#define CL_BLK "\x1B[22;30m" /* black */
@@ -281,7 +290,7 @@ struct thr_api {
#define CL_GR2 "\x1B[22;32m" /* green */
#define CL_BRW "\x1B[22;33m" /* brown */
#define CL_BL2 "\x1B[22;34m" /* blue */
#define CL_MA2 "\x1B[22;35m" /* magenta */
#define CL_MA2 "\x1B[22;35m" /* purple */
#define CL_CY2 "\x1B[22;36m" /* cyan */
#define CL_SIL "\x1B[22;37m" /* gray */
@@ -290,9 +299,9 @@ struct thr_api {
#else
#define CL_GRY "\x1B[90m" /* dark gray selectable in putty */
#endif
#define CL_LRD "\x1B[01;31m" /* light red */
#define CL_LGR "\x1B[01;32m" /* light green */
#define CL_YL2 "\x1B[01;33m" /* yellow */
#define CL_LRD "\x1B[01;31m" /* bright red */
#define CL_LGR "\x1B[01;32m" /* bright green */
#define CL_YL2 "\x1B[01;33m" /* bright yellow */
#define CL_LBL "\x1B[01;34m" /* light blue */
#define CL_LMA "\x1B[01;35m" /* light magenta */
#define CL_LCY "\x1B[01;36m" /* light cyan */
@@ -481,7 +490,7 @@ void format_hashrate(double hashrate, char *output);
void print_hash_tests(void);
void scale_hash_for_display ( double* hashrate, char* units );
void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force );
/*

View File

@@ -78,6 +78,8 @@
// - specialized shift and rotate functions that move elements around
// use the notation "1x32" to indicate the distance moved as units of
// the element size.
// Vector shuffle rotations are being renamed to "vrol" and "vror"
// to avoid confusion with bit rotations.
// - there is a subset of some functions for scalar data. They may have
// no prefix nor vec-size, just one size, the size of the data.
// - Some integer functions are also defined which use a similar notation.
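// Illustrative sketch of the distinction behind the renaming noted above
// (macro names taken from the 128-bit header elsewhere in this commit):
// a bit rotation rotates bits within each lane, while a shuffle "rotation"
// permutes whole lanes.
static inline void rotation_kinds_sketch( __m128i v, __m128i *bits, __m128i *lanes )
{
   *bits  = mm128_ror_32( v, 7 );   // each 32-bit lane rotated right by 7 bits
   *lanes = mm128_shuflr_32( v );   // lanes rotated right by one position
}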

View File

@@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1,
d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51];
d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55];
d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59];
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63];
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63];
}
static inline void extr_lane_2x32( void *dst, const void *src,

View File

@@ -35,6 +35,13 @@
///////////////////////////////////////////////////////////////////////////
// Used instead of casting.
typedef union
{
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
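// Hypothetical usage sketch (helper name is illustrative only): read one
// 32-bit lane of an __m128i through the overlay instead of casting.
static inline uint32_t m128_lane32_sketch( const __m128i v, const int i )
{
   m128_ovly o;
   o.m128 = v;
   return o.u32[ i & 3 ];
}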
// Efficient and convenient moving between GP & low bits of XMM.
// Use VEX when available to give access to xmm8-15 and zero extend for
// larger vectors.
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
return a;
}
static inline uint64_t mm128_mov128_64( const __m128i a )
// Inconsistent naming, the prefix should reflect the return value:
// u64_mov128_64
static inline uint64_t u64_mov128_64( const __m128i a )
{
uint64_t n;
#if defined(__AVX__)
@@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a )
return n;
}
static inline uint32_t mm128_mov128_32( const __m128i a )
static inline uint32_t u32_mov128_32( const __m128i a )
{
uint32_t n;
#if defined(__AVX__)
@@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
// Extract 32 bit element c from v and return as integer.
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. Provided for reference and
// convenience; it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#endif // SSE4_1
//
@@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Diagonal blend: d = { v3[3], v2[2], v1[1], v0[0] }
// Blend 4 32 bit elements from 4 vectors
#if defined (__AVX2__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
   _mm_blend_epi32( _mm_blend_epi32( v3, v2, 0x4 ), \
                    _mm_blend_epi32( v1, v0, 0x1 ), 0x3 )
#elif defined(__SSE4_1__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
   _mm_blend_epi16( _mm_blend_epi16( v3, v2, 0x30 ), \
                    _mm_blend_epi16( v1, v0, 0x03 ), 0x0f )
#endif
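// Scalar sketch (illustrative only) of what the diagonal blend produces:
// lane i of the result comes from lane i of argument vi.
static inline void diagonal_32_sketch( uint32_t d[4], const uint32_t v3[4],
            const uint32_t v2[4], const uint32_t v1[4], const uint32_t v0[4] )
{
   d[0] = v0[0];  d[1] = v1[1];  d[2] = v2[2];  d[3] = v3[3];
}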
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// The x2 variants rotate the elements of 2 individual vectors as a double
// buffered optimization for the SSE2 path; they provide no benefit on AVX512
// but are kept for API transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
@@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_rorx2_64( v1, v0, c ) \
   v0 = _mm_ror_epi64( v0, c ); \
   v1 = _mm_ror_epi64( v1, c )
#define mm128_rolx2_64( v1, v0, c ) \
   v0 = _mm_rol_epi64( v0, c ); \
   v1 = _mm_rol_epi64( v1, c )
#define mm128_rorx2_32( v1, v0, c ) \
   v0 = _mm_ror_epi32( v0, c ); \
   v1 = _mm_ror_epi32( v1, c )
#define mm128_rolx2_32( v1, v0, c ) \
   v0 = _mm_rol_epi32( v0, c ); \
   v1 = _mm_rol_epi32( v1, c )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
@@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
v0 = _mm_slli_epi64( v0, 64-(c) ); \
v1 = _mm_slli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si128( v0, t0 ); \
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rolx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
v0 = _mm_srli_epi64( v0, 64-(c) ); \
v1 = _mm_srli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si128( v0, t0 ); \
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rorx2_32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
v0 = _mm_slli_epi32( v0, 32-(c) ); \
v1 = _mm_slli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si128( v0, t0 ); \
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rolx2_32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
v0 = _mm_srli_epi32( v0, 32-(c) ); \
v1 = _mm_srli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si128( v0, t0 ); \
v1 = _mm_or_si128( v1, t1 ); \
}
#endif // AVX512 else SSE2
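// Usage sketch (helper name is illustrative only): both vectors are rotated
// by the same compile-time constant; on AVX512VL this becomes two rotate
// instructions, on SSE2 the shift/or pairs above.
static inline void ror2x64_sketch( __m128i *a, __m128i *b )
{
   __m128i v0 = *a, v1 = *b;
   mm128_rorx2_64( v1, v0, 13 );   // rotate both right by 13 bits
   *a = v0;
   *b = v1;
}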
#define mm128_ror_16( v, c ) \
@@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_ror_x8( const __m128i v, const int c )
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
//
@@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
#define mm128_ror256_64( v1, v2 ) \
#if defined(__SSSE3__)
// Function macros with two inputs and one output; the inputs are preserved
// and the shuffled result is returned.
// Two input functions are not available without SSSE3. Use the procedure
// macros below instead.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 )
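// Illustrative expansion of one of the function macros above: palignr
// concatenates v2 (high) : v1 (low) and shifts the 256-bit value right by
// 8 bytes, so lane 0 of the result is v1's lane 1 and lane 1 is v2's lane 0.
static inline __m128i shufl2r_64_sketch( const __m128i v1, const __m128i v2 )
{
   return _mm_alignr_epi8( v2, v1, 8 );
}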
// Procedure macros with 2 inputs and 2 outputs, inputs are destroyed.
// Both modified args are returned in place.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shufl2l function macros above.
// They may be renamed to something like shufl2r2 for 2 inputs and
// 2 outputs, i.e. SHUFfLe 2 inputs Right with 2 outputs.
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_rol256_64( v1, v2 ) \
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#define mm128_ror256_32( v1, v2 ) \
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_rol256_32( v1, v2 ) \
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_ror256_16( v1, v2 ) \
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_rol256_16( v1, v2 ) \
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_ror256_8( v1, v2 ) \
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_rol256_8( v1, v2 ) \
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -483,7 +614,7 @@ do { \
#else // SSE2
#define mm128_ror256_64( v1, v2 ) \
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
@@ -492,7 +623,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_64( v1, v2 ) \
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
@@ -501,7 +632,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_32( v1, v2 ) \
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
@@ -510,7 +641,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_32( v1, v2 ) \
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
@@ -519,7 +650,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_16( v1, v2 ) \
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
@@ -528,7 +659,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_16( v1, v2 ) \
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
@@ -537,7 +668,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_8( v1, v2 ) \
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
@@ -546,7 +677,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_8( v1, v2 ) \
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \

View File

@@ -14,13 +14,28 @@
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Used instead of casting.
typedef union
{
__m256i m256;
__m128i m128[2];
uint64_t u64[4];
uint32_t u32[8];
} __attribute__ ((aligned (32))) m256_ovly;
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )
// Move low element of vector to integer.
#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// deprecated
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
@@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
   _mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
                       _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
// Blend 8 32 bit elements from 8 vectors
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
   _mm256_blend_epi32( \
         _mm256_blend_epi32( \
               _mm256_blend_epi32( v7, v6, 0x40 ), \
               _mm256_blend_epi32( v5, v4, 0x10 ), 0x30 ), \
         _mm256_blend_epi32( \
               _mm256_blend_epi32( v3, v2, 0x04 ), \
               _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
   _mm256_blend_epi32( \
         _mm256_blend_epi32( v3, v2, 0x44 ), \
         _mm256_blend_epi32( v1, v0, 0x11 ), 0x33 )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128.
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// The x2 variants rotate the elements of 2 individual vectors as a double
// buffered optimization for the pre-AVX512 path; they provide no benefit on
// AVX512 but are kept for API transparency.
// compiler doesn't like when a variable is used for the last arg of
@@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#define mm256_rorx2_64( v1, v0, c ) \
   v0 = _mm256_ror_epi64( v0, c ); \
   v1 = _mm256_ror_epi64( v1, c )
#define mm256_rolx2_64( v1, v0, c ) \
   v0 = _mm256_rol_epi64( v0, c ); \
   v1 = _mm256_rol_epi64( v1, c )
#define mm256_rorx2_32( v1, v0, c ) \
   v0 = _mm256_ror_epi32( v0, c ); \
   v1 = _mm256_ror_epi32( v1, c )
#define mm256_rolx2_32( v1, v0, c ) \
   v0 = _mm256_rol_epi32( v0, c ); \
   v1 = _mm256_rol_epi32( v1, c )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
@@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
#define mm256_rorx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi64( v0, c ); \
__m256i t1 = _mm256_srli_epi64( v1, c ); \
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi64( v0, c ); \
__m256i t1 = _mm256_slli_epi64( v1, c ); \
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rorx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi32( v0, c ); \
__m256i t1 = _mm256_srli_epi32( v1, c ); \
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( v0, c ); \
__m256i t1 = _mm256_slli_epi32( v1, c ); \
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#endif // AVX512 else AVX2
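// Usage sketch (illustrative only, helper name is hypothetical): rotate two
// independent state vectors with one macro call. With AVX512VL this compiles
// to two rotate instructions, with AVX2 to the interleaved shift/or sequence
// above, hiding latency across the two streams.
static inline void mm256_rorx2_64_example( __m256i *s0, __m256i *s1 )
{
   __m256i v0 = *s0, v1 = *s1;
   mm256_rorx2_64( v1, v0, 25 );   // rotate each 64 bit lane right by 25
   *s0 = v0;
   *s1 = v1;
}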
#define mm256_ror_16( v, c ) \
@@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements across all lanes.
#if defined(__AVX512VL__)
static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_rol_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 7 ); }
#else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 )
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_rol_1x32( v ) \
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 )
0x0000000200000001, 0x0000000000000007 ) )
#endif // AVX512 else AVX2
//
// Rotate elements within each 128 bit lane of 256 bit vector.
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
@@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
#define mm256_ror512_128( v1, v2 ) \
#define mm256_vror512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_rol512_128( v1, v2 ) \
#define mm256_vrol512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v2 = _mm256_permute2x128( v2, v1, 0x21 ); \

View File

@@ -74,13 +74,22 @@
// __AVX512VBMI__ __AVX512VAES__
//
// Used instead if casting.
typedef union
{
__m512i m512;
__m128i m128[4];
uint32_t u32[16];
uint64_t u64[8];
} __attribute__ ((aligned (64))) m512_ovly;
// Move integer to/from element 0 of vector.
#define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )
#define mm512_mov256_64( a ) mm128_mov128_64( _mm512_castsi512_si128( a ) )
#define mm512_mov256_32( a ) mm128_mov128_32( _mm512_castsi512_si128( a ) )
#define u64_mov512_64( a ) u64_mov128_64( _mm512_castsi512_si128( a ) )
#define u32_mov512_32( a ) u32_mov128_32( _mm512_castsi512_si128( a ) )
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
// Equivalent of set, assign 64 bit integers to respective 64 bit elements.
// Use stack memory overlay
static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
@@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses 8 bit truth table to define any 3 input logical
// operation using any number or combinations of AND, OR XOR, NOT.
// expression using any number or combinations of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
@@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b & c )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ]
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
@@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Some 2 input operations that don't have their own instruction mnemonic.
// ~( a | b )
// ~( a | b ), (~a) & (~b)
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), same as (~a) ^ b
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
@@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
_mm512_ternarylogic_epi64( a, b, b, 0xef )
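// Illustrative derivation of the 8 bit truth table constants used above
// (helper name is hypothetical): bit i of the immediate is the desired
// output when the input bits are a = (i>>2)&1, b = (i>>1)&1, c = i&1.
// For a ^ b ^ c the outputs for i = 0..7 are 0,1,1,0,1,0,0,1, giving 0x96.
static inline unsigned ternlog_imm8_xor3_example( void )
{
   unsigned imm = 0;
   for ( int i = 0; i < 8; i++ )
   {
      const int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
      if ( a ^ b ^ c ) imm |= 1u << i;
   }
   return imm;   // 0x96, the constant used by mm512_xor3
}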
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
  _mm512_mask_blend_epi64( 0x0f, \
        _mm512_mask_blend_epi64( 0x30, \
              _mm512_mask_blend_epi64( 0x40, v7, v6 ), \
              _mm512_mask_blend_epi64( 0x10, v5, v4 ) ), \
        _mm512_mask_blend_epi64( 0x03, \
              _mm512_mask_blend_epi64( 0x04, v3, v2 ), \
              _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
@@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Rotate elements in 512 bit vector.
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// Rename plan: change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
// variable rotate rorv, rolv.
// Plan changed: use shuflr & shufll instead, symbolizing a shuffle-rotate
// operation. The 1xNN notation is also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
#define mm512_shiftr_256( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 )
#define mm512_shiftl_256( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 )
#define mm512_shiftr_128( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 )
#define mm512_shiftl_128( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 6 )
#define mm512_shiftr_64( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 )
#define mm512_shiftl_64( v ) \
   _mm512_alignr_epi64( v, _mm512_setzero_si512(), 7 )
#define mm512_shiftr_32( v ) \
   _mm512_alignr_epi32( _mm512_setzero_si512(), v, 1 )
#define mm512_shiftl_32( v ) \
   _mm512_alignr_epi32( v, _mm512_setzero_si512(), 15 )
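// Illustrative sketch of combining element shifts (helper name is
// hypothetical): two successive 64 bit zero fill shifts equal one 128 bit
// shift, expressed here directly with the underlying alignr intrinsic and
// an explicit zero vector.
static inline __m512i mm512_shiftr_128_example( const __m512i v )
{
   return _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 );
}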
// Shuffle-rotate elements left or right in 512 bit vector.
static inline __m512i mm512_swap_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256( v ) mm512_swap_256( v )
#define mm512_shufll_256( v ) mm512_swap_256( v )
static inline __m512i mm512_ror_1x128( const __m512i v )
static inline __m512i mm512_shuflr_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 2 ); }
static inline __m512i mm512_rol_1x128( const __m512i v )
static inline __m512i mm512_shufll_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 6 ); }
static inline __m512i mm512_ror_1x64( const __m512i v )
static inline __m512i mm512_shuflr_64( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 1 ); }
static inline __m512i mm512_rol_1x64( const __m512i v )
static inline __m512i mm512_shufll_64( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 7 ); }
static inline __m512i mm512_ror_1x32( const __m512i v )
static inline __m512i mm512_shuflr_32( const __m512i v )
{ return _mm512_alignr_epi32( v, v, 1 ); }
static inline __m512i mm512_rol_1x32( const __m512i v )
static inline __m512i mm512_shufll_32( const __m512i v )
{ return _mm512_alignr_epi32( v, v, 15 ); }
static inline __m512i mm512_ror_x64( const __m512i v, const int n )
// Generic
static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
{ return _mm512_alignr_epi64( v, v, n ); }
static inline __m512i mm512_ror_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
#define mm512_ror_1x16( v ) \
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
#define mm512_rol_1x16( v ) \
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
#define mm512_ror_1x8( v ) \
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x003F3E3D3C3B3A39, 0x3837363534333231, \
0x302F2E2D2C2B2A29, 0x2827262524232221, \
0x201F1E1D1C1B1A19, 0x1817161514131211, \
0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
#define mm512_rol_1x8( v ) \
#define mm512_shufll_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3B3A393837, 0x363534333231302F, \
0x2E2D2C2B2A292827, 0x262524232221201F, \
@@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror256_32( v ) \
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol256_32( v ) \
#define mm512_shufll256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror256_16( v ) \
#define mm512_shuflr256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0010001f001e001d, 0x001c001b001a0019, \
0x0018001700160015, 0x0014001300120011, \
0x0000000f000e000d, 0x000c000b000a0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol256_16( v ) \
#define mm512_shufll256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a001900180017, \
0x0016001500140013, 0x001200110010001f, \
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
#define mm512_ror256_8( v ) \
#define mm512_shuflr256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x203f3e3d3c3b3a39, 0x3837363534333231, \
0x302f2e2d2c2b2a29, 0x2827262524232221, \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm512_rol256_8( v ) \
#define mm512_shufll256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231302f, \
0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
0x0e0d0c0b0a090807, 0x060504030201001f ) )
//
// Rotate elements within 128 bit lanes of 512 bit vector.
// Shuffle-rotate elements within 128 bit lanes of 512 bit vector.
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes
static inline __m512i mm512_ror128_x8( const __m512i v, const int c )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Swap 32 bits in each 64 bit lane.
// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
// but only with AVX512. Shuffle is just as fast and available with AVX2
// & SSE2.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// Rotate elements from 2 512 bit vectors in place, source arguments
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
// add shuflr shufll functions performing rotate, returning first arg
// They're faster than doing both, when both not needed.
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
// rotated v1
// visually confusing for shufl2r because of arg order. First arg is always
// the target for modification, either update by reference or by function
// return.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
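// Illustrative function form of the 2 input shuffle (name is hypothetical):
// returns v1 shifted right by one 64 bit element with the low element of v2
// shifted in at the top, i.e. a 64 bit slide over the concatenation. It is
// the same operation as mm512_shufl2r_64 above.
static inline __m512i mm512_shufl2r_64_example( const __m512i v1,
                                                const __m512i v2 )
{
   return _mm512_alignr_epi64( v2, v1, 1 );
}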
// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.
#define mm512_swap1024_512( v1, v2 ) \
v1 = _mm512_xor_si512( v1, v2 ); \
v2 = _mm512_xor_si512( v1, v2 ); \
v1 = _mm512_xor_si512( v1, v2 );
#define mm512_shufl2l_512 mm512_swap1024_512
#define mm512_shufl2r_512 mm512_swap1024_512
#define mm512_ror1024_256( v1, v2 ) \
// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
// for now.
// Rotate elements from 2 512 bit vectors in place, both source arguments
// are updated.
#define mm512_vror1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_256( v1, v2 ) \
#define mm512_vrol1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_128( v1, v2 ) \
#define mm512_vror1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_128( v1, v2 ) \
#define mm512_vrol1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_64( v1, v2 ) \
#define mm512_vror1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_64( v1, v2 ) \
#define mm512_vrol1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_32( v1, v2 ) \
#define mm512_vror1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_32( v1, v2 ) \
#define mm512_vrol1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \

View File

@@ -68,13 +68,13 @@
// rotation.
// Swap hi & lo 32 bits.
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 )
#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 )
#define mm64_shuflr_16( a ) _mm_shuffle_pi16( a, 0x39 )
#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#if defined(__SSSE3__)
@@ -86,7 +86,7 @@
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline __m64 mm64_ror_x8( __m64 v, const int c )
static inline __m64 mm64_vror_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else

View File

@@ -5,10 +5,19 @@
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
// safe division, integer or floating point
// Safe division, integer or floating point. For floating point it is only
// as safe as the divisor comparing exactly equal to 0.
// Returns safe_result if division by zero.
#define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
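// Illustrative use (names are hypothetical, not from the miner): compute a
// mean without risking a divide by zero while the count is still zero.
static inline double mean_or_zero_example( double sum, int count )
{
   return safe_div( sum, count, 0. );
}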
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
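// Illustrative use of the rotate aliases (hypothetical helper mirroring the
// SHA-256 small sigma0 function, rotr7 ^ rotr18 ^ shr3):
static inline uint32_t sha256_sigma0_example( uint32_t x )
{
   return ror32( x, 7 ) ^ ror32( x, 18 ) ^ ( x >> 3 );
}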
///////////////////////////////////////
//
@@ -29,12 +38,14 @@
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// obsolete test
// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
#endif
// obsolete test
#if !defined(GCC_INT128)
#warning "__int128 not supported, requires GCC-4.8 or newer."
#endif

View File

@@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
for (int i = 2; i <= (ext & 0xF); i++)
{
cpuid(0x80000000+i, output);
memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
}
snprintf(outbuf, maxsz, "%s", brand);
}

util.c
View File

@@ -47,6 +47,7 @@
//#include "miner.h"
#include "elist.h"
#include "algo-gate-api.h"
#include "algo/sha/sha256d.h"
//extern pthread_mutex_t stats_lock;
@@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... )
// localtime_r(&now, &tm);
switch (prio) {
switch ( prio )
{
case LOG_CRIT: color = CL_LRD; break;
case LOG_ERR: color = CL_RED; break;
case LOG_WARNING: color = CL_YLW; break;
case LOG_WARNING: color = CL_YL2; break;
case LOG_MAJR: color = CL_YL2; break;
case LOG_NOTICE: color = CL_WHT; break;
case LOG_INFO: color = ""; break;
case LOG_DEBUG: color = CL_GRY; break;
case LOG_BLUE:
prio = LOG_NOTICE;
color = CL_CYN;
break;
case LOG_MINR: color = CL_YLW; break;
case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break;
case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break;
case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break;
}
if (!use_colors)
color = "";
@@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...)
localtime_r(&now, &tm);
switch (prio) {
case LOG_ERR: color = CL_RED; break;
case LOG_WARNING: color = CL_YLW; break;
switch ( prio )
{
case LOG_CRIT: color = CL_LRD; break;
case LOG_ERR: color = CL_RED; break;
case LOG_WARNING: color = CL_YL2; break;
case LOG_MAJR: color = CL_YL2; break;
case LOG_NOTICE: color = CL_WHT; break;
case LOG_INFO: color = ""; break;
case LOG_INFO: color = ""; break;
case LOG_DEBUG: color = CL_GRY; break;
case LOG_BLUE:
prio = LOG_NOTICE;
color = CL_CYN;
break;
case LOG_MINR: color = CL_YLW; break;
case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break;
case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break;
case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break;
}
if (!use_colors)
color = "";
@@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output)
);
}
// For use with MiB etc
void format_number_si( double* n, char* si_units )
{
if ( *n < 1024*10 ) { *si_units = 0; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'k'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'M'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'G'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'T'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'P'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'E'; return; }
*n /= 1024;
if ( *n < 1024*10 ) { *si_units = 'Z'; return; }
*n /= 1024;
*si_units = 'Y';
}
/* Modify the representation of integer numbers which would cause an overflow
* so that they are treated as floating-point numbers.
* This is a hack to overcome the limitations of some versions of Jansson. */