Compare commits

...

5 Commits

Author      SHA1         Message   Date
Jay D Dee   9b905fccc8   v3.17.1   2021-07-26 15:01:37 -04:00
Jay D Dee   92b3733925   v3.17.0   2021-07-15 20:30:44 -04:00
Jay D Dee   19cc88d102   v3.16.5   2021-06-26 12:27:44 -04:00
Jay D Dee   a053690170   v3.16.4   2021-06-23 21:52:42 -04:00
Jay D Dee   3c5e8921b7   v3.16.3   2021-05-06 14:55:03 -04:00
69 changed files with 3077 additions and 1175 deletions

View File

@@ -163,6 +163,8 @@ cpuminer_SOURCES = \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
   algo/sha/sha512-hash-4way.c \
+  algo/sha/sha256-hash-opt.c \
+  algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
   algo/sha/sha2.c \

View File

@@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact
 procedure is documented in the build instructions for Windows:
 https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
 
+Some DLL files may already be installed on the system by Windows or
+third-party packages. They will often work and may be used instead of the
+included files. Without a compelling reason to do so, it's recommended to
+use the included files as they are packaged.
+
 If you like this software feel free to donate:
 
 BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT

View File

@@ -65,11 +65,39 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.17.1
+
+Fixed Windows build for AES+SSE4.2 (Westmere), which was missing AES.
+More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
+Fixed my-gr algo for VAES.
+
+v3.17.0
+
+AVX512 optimized using ternary logic instructions.
+Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
+Use SHA on supported CPUs to produce merkle hash.
+Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
+
+v3.16.5
+
+#329: Fixed GBT incorrect target diff in stats, second attempt.
+Fixed formatting error in share result log when --no-color option is used.
+
+v3.16.4
+
+Faster sha512 and sha256 when not using SHA CPU extension.
+#329: Fixed GBT incorrect target diff in stats.
+
+v3.16.3
+
+#313: Fixed compile error with GCC 11.
+Incremental improvements to verthash.
+
 v3.16.2
 
 Verthash: midstate prehash optimization for all architectures.
 Verthash: AVX2 optimization.
-GBT: added support for Bech32 addresses, untested.
+GBT: added support for Bech32 addresses.
 Linux: added CPU frequency to benchmark log.
 Fixed integer overflow in time calculations.

@@ -112,7 +140,6 @@ v3.15.5
 
 Fix stratum jobs lost if 2 jobs received in less than one second.
 
 v3.15.4
 
 Fixed yescryptr16 broken in v3.15.3.
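
The "ternary logic" entries above refer to AVX-512's vpternlog instructions,
which evaluate any three-input Boolean function in a single operation. A
minimal sketch of how a 3-way XOR helper like the mm512_xor3 used throughout
the diffs below can be built on it — the exact definitions in the project's
simd-utils headers may differ:

   #include <immintrin.h>

   // AVX-512: one vpternlogq computes a ^ b ^ c; 0x96 is the truth table
   // of 3-input XOR (output is 1 when an odd number of inputs are 1).
   static inline __m512i xor3_512( __m512i a, __m512i b, __m512i c )
   {
      return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
   }

   // AVX2 has no ternary-logic instruction, so a 256-bit xor3 stays two
   // dependent XORs; the saving on AVX-512 is one instruction per use.
   static inline __m256i xor3_256( __m256i a, __m256i b, __m256i c )
   {
      return _mm256_xor_si256( _mm256_xor_si256( a, b ), c );
   }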

View File

@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
 void blake512_8way_full( blake_8way_big_context *sc, void * dst,
                          const void *data, size_t len );
+void blake512_8way_hash_le80( void *hash, const void *data );
 
 #endif // AVX512
 #endif // AVX2

View File

@@ -669,14 +669,14 @@ do { \
       ROUND_S_8WAY(2); \
       ROUND_S_8WAY(3); \
    } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)

@@ -808,14 +808,14 @@ do { \
       ROUND_S_16WAY(2); \
       ROUND_S_16WAY(3); \
    } \
-   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
-   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
-   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
-   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
-   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
-   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
-   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
-   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)
 
 #endif

View File

@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
       B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
    }
 
-   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
-   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
-   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
-   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
-   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
-   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
-   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
-   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
+   ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
+   ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
+   ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
+   ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
+   ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
+   ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
+   ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
+   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
 int blake2b_8way_init( blake2b_8way_ctx *ctx )

View File

@@ -17,7 +17,7 @@
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m512i b[16];   // input buffer
    __m512i h[8];    // chained state
    uint64_t t[2];   // total number of bytes

@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
 #if defined(__AVX2__)
 
 // state context
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m256i b[16];   // input buffer
    __m256i h[8];    // chained state
    uint64_t t[2];   // total number of bytes
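
The alignment changes in this header (and the similar ones in the headers
below) move the attribute to a position that binds reliably to the struct
type and relax the alignment from 128 bytes to 64, the x86 cache-line size.
A small illustration of the two placements, assuming ALIGN wraps GCC's
aligned attribute as is conventional in this codebase:

   // Hypothetical stand-in for the codebase's ALIGN macro.
   #define ALIGN(x) __attribute__ ((aligned (x)))

   // An attribute written before 'typedef' does not reliably bind to the
   // struct type being defined and may be ignored with a warning:
   //    ALIGN(128) typedef struct { ... } ctx_t;

   // An attribute between 'struct' and the body binds to the type itself,
   // so every ctx_t object gets 64-byte alignment:
   typedef struct ALIGN( 64 )
   {
      unsigned char buf[128];
   } ctx_t;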

View File

@@ -4,7 +4,6 @@
 #include <stdint.h>
 #include "algo-gate-api.h"
 
-//#if defined(__SSE4_2__)
 #if defined(__SSE2__)
   #define BLAKE2S_4WAY
 #endif

@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
 #elif defined (BLAKE2S_8WAY)
 
-//#if defined(BLAKE2S_8WAY)
 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -368,7 +368,7 @@ do { \
    ROUND8W( 9 );
 
    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
 
 #undef G8W
 #undef ROUND8W

@@ -566,7 +566,7 @@ do { \
    ROUND16W( 9 );
 
    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
 
 #undef G16W
 #undef ROUND16W

View File

@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
 } blake2s_nway_param;
 #pragma pack(pop)
 
-ALIGN( 64 ) typedef struct __blake2s_4way_state
+typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
    __m128i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];

@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 #if defined(__AVX2__)
 
-ALIGN( 64 ) typedef struct __blake2s_8way_state
+typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];

@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN( 128 ) typedef struct __blake2s_16way_state
+typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];

View File

@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
    H5 = (state)->H[5]; \
    H6 = (state)->H[6]; \
    H7 = (state)->H[7]; \
-   S0 = (state)->S[0]; \
-   S1 = (state)->S[1]; \
-   S2 = (state)->S[2]; \
-   S3 = (state)->S[3]; \
    T0 = (state)->T0; \
    T1 = (state)->T1; \
 } while (0)

@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
    (state)->H[5] = H5; \
    (state)->H[6] = H6; \
    (state)->H[7] = H7; \
-   (state)->S[0] = S0; \
-   (state)->S[1] = S1; \
-   (state)->S[2] = S2; \
-   (state)->S[3] = S3; \
    (state)->T0 = T0; \
    (state)->T1 = T1; \
 } while (0)

@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {
 #define DECL_STATE64_8WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m512i S0, S1, S2, S3; \
    uint64_t T0, T1;
 
 #define COMPRESS64_8WAY( buf ) do \

@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
-   V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
-   VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
-   VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
+   V8 = m512_const1_64( CB0 ); \
+   V9 = m512_const1_64( CB1 ); \
+   VA = m512_const1_64( CB2 ); \
+   VB = m512_const1_64( CB3 ); \
    VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
                           m512_const1_64( CB4 ) ); \
    VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \

@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
    ROUND_B_8WAY(3); \
    ROUND_B_8WAY(4); \
    ROUND_B_8WAY(5); \
-   H0 = mm512_xor4( V8, V0, S0, H0 ); \
-   H1 = mm512_xor4( V9, V1, S1, H1 ); \
-   H2 = mm512_xor4( VA, V2, S2, H2 ); \
-   H3 = mm512_xor4( VB, V3, S3, H3 ); \
-   H4 = mm512_xor4( VC, V4, S0, H4 ); \
-   H5 = mm512_xor4( VD, V5, S1, H5 ); \
-   H6 = mm512_xor4( VE, V6, S2, H6 ); \
-   H7 = mm512_xor4( VF, V7, S3, H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)
 
 void blake512_8way_compress( blake_8way_big_context *sc )

@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
-   V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
-   VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
-   VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
+   V8 = m512_const1_64( CB0 );
+   V9 = m512_const1_64( CB1 );
+   VA = m512_const1_64( CB2 );
+   VB = m512_const1_64( CB3 );
    VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
                           m512_const1_64( CB4 ) );
    VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),

@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    ROUND_B_8WAY(4);
    ROUND_B_8WAY(5);
 
-   sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
 }
 
 void blake512_8way_init( blake_8way_big_context *sc )
 {
-   __m512i zero = m512_zero;
    casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
    casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
    casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );

@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
-   casti_m512i( sc->S, 0 ) = zero;
-   casti_m512i( sc->S, 1 ) = zero;
-   casti_m512i( sc->S, 2 ) = zero;
-   casti_m512i( sc->S, 3 ) = zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }

@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
-   casti_m512i( sc->S, 0 ) = m512_zero;
-   casti_m512i( sc->S, 1 ) = m512_zero;
-   casti_m512i( sc->S, 2 ) = m512_zero;
-   casti_m512i( sc->S, 3 ) = m512_zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;

@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
 #define DECL_STATE64_4WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m256i S0, S1, S2, S3; \
    uint64_t T0, T1;
 
 #define COMPRESS64_4WAY do \

@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
+   V8 = m256_const1_64( CB0 ); \
+   V9 = m256_const1_64( CB1 ); \
+   VA = m256_const1_64( CB2 ); \
+   VB = m256_const1_64( CB3 ); \
    VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
                           m256_const1_64( CB4 ) ); \
    VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \

@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
    ROUND_B_4WAY(3); \
    ROUND_B_4WAY(4); \
    ROUND_B_4WAY(5); \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)

@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
-   V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
-   VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
-   VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
+   V8 = m256_const1_64( CB0 );
+   V9 = m256_const1_64( CB1 );
+   VA = m256_const1_64( CB2 );
+   VB = m256_const1_64( CB3 );
    VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
                           m256_const1_64( CB4 ) );
    VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),

@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    ROUND_B_4WAY(4);
    ROUND_B_4WAY(5);
 
-   sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }
 
 void blake512_4way_init( blake_4way_big_context *sc )
 {
-   __m256i zero = m256_zero;
    casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
    casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
    casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );

@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-   casti_m256i( sc->S, 0 ) = zero;
-   casti_m256i( sc->S, 1 ) = zero;
-   casti_m256i( sc->S, 2 ) = zero;
-   casti_m256i( sc->S, 3 ) = zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }

@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-   casti_m256i( sc->S, 0 ) = m256_zero;
-   casti_m256i( sc->S, 1 ) = m256_zero;
-   casti_m256i( sc->S, 2 ) = m256_zero;
-   casti_m256i( sc->S, 3 ) = m256_zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
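
For context on the blake512 hunks above: BLAKE-512's optional salt S[0..3]
enters the compression function twice, as V8..VB = S[i] ^ CBi and in the
feed-forward H'[i] = H[i] ^ S[i mod 4] ^ V[i] ^ V[i+8]. The miner never uses
a salted hash, so S is always zero, the XOR-with-S initialisation can be
dropped outright, and every four-input mm512_xor4/mm256_xor4 collapses to a
three-input xor3. A scalar sketch of the identity:

   #include <stdint.h>

   // Per output word, BLAKE-512's feed-forward with salt (spec form):
   static inline uint64_t feed_forward( uint64_t h, uint64_t s,
                                        uint64_t v_lo, uint64_t v_hi )
   {
      return h ^ s ^ v_lo ^ v_hi;   // four-input XOR -> mm512_xor4
   }

   // With the salt fixed at zero the 's' term drops out, leaving the
   // three-input XOR the new code expresses as mm512_xor3 / mm256_xor3:
   static inline uint64_t feed_forward_nosalt( uint64_t h,
                                               uint64_t v_lo, uint64_t v_hi )
   {
      return h ^ v_lo ^ v_hi;
   }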

View File

@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
 
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
-   blake2s_state S[1];
+   blake2s_state S;
 
    /* Verify parameters */
    if ( NULL == in ) return -1;

@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
    if( keylen > 0 )
    {
-      if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+      if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
    }
    else
    {
-      if( blake2s_init( S, outlen ) < 0 ) return -1;
+      if( blake2s_init( &S, outlen ) < 0 ) return -1;
    }
 
-   blake2s_update( S, ( uint8_t * )in, inlen );
-   blake2s_final( S, out, outlen );
+   blake2s_update( &S, ( uint8_t * )in, inlen );
+   blake2s_final( &S, out, outlen );
    return 0;
 }
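
A minimal illustration of the state-declaration change above, with stand-in
types and names rather than the real blake2s API: the single-element-array
idiom lets the name decay to a pointer at call sites, while the replacement
declares a plain object and passes its address explicitly. The generated
code is the same; only the intent is clearer.

   #include <stdint.h>

   typedef struct { uint32_t h[8]; } state_t;          // stand-in state
   static void state_init( state_t *s ) { s->h[0] = 0; } // stand-in API

   static void old_style( void )
   {
      state_t S[1];      // array of one: 'S' decays to state_t* when passed
      state_init( S );
   }

   static void new_style( void )
   {
      state_t S;         // plain object: pass its address explicitly
      state_init( &S );
   }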

View File

@@ -116,7 +116,7 @@ extern "C" {
    uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
 } blake2s_param;
 
-ALIGN( 64 ) typedef struct __blake2s_state
+typedef struct ALIGN( 64 ) __blake2s_state
 {
    uint32_t h[8];
    uint32_t t[2];

View File

@@ -18,7 +18,7 @@
 #endif
 
 // state context
-ALIGN(64) typedef struct {
+typedef ALIGN(64) struct {
    uint8_t b[128];    // input buffer
    uint64_t h[8];     // chained state
    uint64_t t[2];     // total number of bytes

View File

@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
    qt[30] = expand2s8( qt, M, H, 30 );
    qt[31] = expand2s8( qt, M, H, 31 );
 
-   xl = _mm256_xor_si256(
-                 mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm256_xor_si256( xl, _mm256_xor_si256(
-                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
+                    mm256_xor3( qt[19], qt[20], qt[21] ),
+                    _mm256_xor_si256( qt[22], qt[23] ) );
+   xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
+                    mm256_xor3( qt[26], qt[27], qt[28] ),
+                    mm256_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-      _mm256_xor_si256( M[m], \
-         _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
-                           _mm256_srli_epi32( qt[a], sr ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
+                                 _mm256_srli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-      _mm256_xor_si256( M[m], \
-         _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
-                           _mm256_slli_epi32( qt[a], sr ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
+                                 _mm256_slli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );

@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
 #undef DH2L
 #undef DH2R
 
-/*
-   dH[ 0] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[0],
-                    _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
-                                      _mm256_srli_epi32( qt[16], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[1],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
-                                      _mm256_slli_epi32( qt[17], 8 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[2],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
-                                      _mm256_slli_epi32( qt[18], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[3],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
-                                      _mm256_slli_epi32( qt[19], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[4],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
-                                      _mm256_slli_epi32( qt[20], 0 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[5],
-                    _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
-                                      _mm256_srli_epi32( qt[21], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[6],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
-                                      _mm256_slli_epi32( qt[22], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[7],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
-                                      _mm256_slli_epi32( qt[23], 2 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-*/
 }
 
 static const __m256i final_s8[16] =

@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
    qt[30] = expand2s16( qt, M, H, 30 );
    qt[31] = expand2s16( qt, M, H, 31 );
 
-   xl = _mm512_xor_si512(
-                 mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
-                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
+                    mm512_xor3( qt[19], qt[20], qt[21] ),
+                    _mm512_xor_si512( qt[22], qt[23] ) );
+   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
+                    mm512_xor3( qt[26], qt[27], qt[28] ),
+                    mm512_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
-                           _mm512_srli_epi32( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
+                                 _mm512_srli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
-                           _mm512_slli_epi32( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
+                                 _mm512_slli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );

View File

@@ -1285,40 +1285,35 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
    qt[30] = expand2b8( qt, M, H, 30 );
    qt[31] = expand2b8( qt, M, H, 31 );
 
-   xl = _mm512_xor_si512(
-                 mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
-                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
+                    mm512_xor3( qt[19], qt[20], qt[21] ),
+                    _mm512_xor_si512( qt[22], qt[23] ) );
+   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
+                    mm512_xor3( qt[26], qt[27], qt[28] ),
+                    mm512_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm512_add_epi64( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
-                           _mm512_srli_epi64( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
+                                 _mm512_srli_epi64( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm512_add_epi64( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
-                           _mm512_slli_epi64( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
+                                 _mm512_slli_epi64( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm512_add_epi64( _mm512_add_epi64( \
                      mm512_rol_64( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm512_add_epi64( _mm512_add_epi64( \
                      mm512_rol_64( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );

View File

@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
 MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
 
+#define ECHO_SUBBYTES4(state, j) \
+   state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
+   state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
+   state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
+   state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
+
 #define ECHO_SUBBYTES(state, i, j) \
    state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
    k1 = _mm_add_epi32(k1, M128(const1));\

@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    t1 = _mm_and_si128(t1, M128(lsbmask));\
    t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
    s2 = _mm_xor_si128(s2, t2);\
-   state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
+   state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
    state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
    state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
    state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\

@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
    s2 = _mm_xor_si128(s2, t2);\
    state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
-   state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
+   state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
    state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
    state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
    s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\

@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    s2 = _mm_xor_si128(s2, t2);\
    state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
    state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
-   state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
+   state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
    state2[3][j] = _mm_xor_si128(state2[3][j], s2)
 
+#define ECHO_ROUND_UNROLL2 \
+   ECHO_SUBBYTES4(_state, 0);\
+   ECHO_SUBBYTES4(_state, 1);\
+   ECHO_SUBBYTES4(_state, 2);\
+   ECHO_SUBBYTES4(_state, 3);\
+   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4(_state2, 0);\
+   ECHO_SUBBYTES4(_state2, 1);\
+   ECHO_SUBBYTES4(_state2, 2);\
+   ECHO_SUBBYTES4(_state2, 3);\
+   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2 \
    ECHO_SUBBYTES(_state, 0, 0);\
    ECHO_SUBBYTES(_state, 1, 0);\

@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE(dst, src)\
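
The new ECHO_SUBBYTES4 macros fuse the four per-row SubBytes steps of one
column, keeping the round-counter increments of k1 in the same order as the
old per-cell macro and batching the second, zero-keyed AES round that ECHO's
SubWords step specifies. A loop-form sketch of what one
ECHO_SUBBYTES4(state, j) invocation expands to, with 'one' standing in for
M128(const1):

   #include <immintrin.h>

   // Four keyed AES rounds, one per row of column j, bumping the round
   // counter k1 after each, then the zero-keyed second round per cell.
   static inline void echo_subbytes4( __m128i state[4][4], int j,
                                      __m128i *k1, __m128i one )
   {
      for ( int i = 0; i < 4; i++ )
      {
         state[i][j] = _mm_aesenc_si128( state[i][j], *k1 );
         *k1 = _mm_add_epi32( *k1, one );
      }
      for ( int i = 0; i < 4; i++ )
         state[i][j] = _mm_aesenc_si128( state[i][j], _mm_setzero_si128() );
   }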

View File

@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-//#define mul2mask m512_const2_64( 0, 0x00001b00 )
-//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
-//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
-
-//#define lsbmask m512_const1_32( 0x01010101 )
+#define ECHO_SUBBYTES4(state, j) \
+   state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
+   state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
+   state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
+   state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
 
 #define ECHO_SUBBYTES( state, i, j ) \
    state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \

@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    t1 = _mm512_and_si512( t1, lsbmask ); \
    t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
    s2 = _mm512_xor_si512( s2, t2 );\
-   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
+   state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
    state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
    state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \

@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
    s2 = _mm512_xor_si512( s2, t2 ); \
    state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
-   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
+   state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
    state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
    s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \

@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    s2 = _mm512_xor_si512( s2, t2 ); \
    state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
    state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
-   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
+   state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
 } while(0)
 
+#define ECHO_ROUND_UNROLL2 \
+   ECHO_SUBBYTES4(_state, 0);\
+   ECHO_SUBBYTES4(_state, 1);\
+   ECHO_SUBBYTES4(_state, 2);\
+   ECHO_SUBBYTES4(_state, 3);\
+   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4(_state2, 0);\
+   ECHO_SUBBYTES4(_state2, 1);\
+   ECHO_SUBBYTES4(_state2, 2);\
+   ECHO_SUBBYTES4(_state2, 3);\
+   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2 \
    ECHO_SUBBYTES(_state, 0, 0);\
    ECHO_SUBBYTES(_state, 1, 0);\

@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE(dst, src)\
    dst[0][0] = src[0][0];\

@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
 #define lsbmask_2way m256_const1_32( 0x01010101 )
 
+#define ECHO_SUBBYTES4_2WAY( state, j ) \
+   state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
+   state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
+   state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
+   state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
+
 #define ECHO_SUBBYTES_2WAY( state, i, j ) \
    state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
    k1 = _mm256_add_epi32( k1, m256_one_128 ); \

@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
    state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
 } while(0)
 
+#define ECHO_ROUND_UNROLL2_2WAY \
+   ECHO_SUBBYTES4_2WAY(_state, 0);\
+   ECHO_SUBBYTES4_2WAY(_state, 1);\
+   ECHO_SUBBYTES4_2WAY(_state, 2);\
+   ECHO_SUBBYTES4_2WAY(_state, 3);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4_2WAY(_state2, 0);\
+   ECHO_SUBBYTES4_2WAY(_state2, 1);\
+   ECHO_SUBBYTES4_2WAY(_state2, 2);\
+   ECHO_SUBBYTES4_2WAY(_state2, 3);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2_2WAY \
    ECHO_SUBBYTES_2WAY(_state, 0, 0);\
    ECHO_SUBBYTES_2WAY(_state, 1, 0);\

@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
    ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE_2WAY(dst, src)\
    dst[0][0] = src[0][0];\
dst[0][0] = src[0][0];\ dst[0][0] = src[0][0];\

View File

@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\ t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1) s7 = _mm_xor_si128(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\ #define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\ s1 = x;\
s2 = _mm_add_epi8(x, x);\ s2 = _mm_add_epi8(x, x);\
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_and_si128(t1, M128(_lsbmask2));\ t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\ s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1)) x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ #define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero ) _t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\ #define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\ PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4) POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\ #define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = t2;\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
t4 = t1;\ t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\ t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t1 = t4;\ t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t1 = t2;\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t2 = _mm_xor_si128(t2, t3);\ t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_xor_si128(t2, t0);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\ t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\ t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = t0;\ t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\ t0 = _mm_xor_si128(t0, t3);\
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0) t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\ #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\ PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\ r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\ r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\ UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\ r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\ r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\ UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3) UNPACK_S0(r3c, r3a, _t3)
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\ #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\ PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\ r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\ r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\ UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\ r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\ r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\ UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\ r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\ r4d = _mm_xor_si128(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\ UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\ SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
UNPACK_S0(r4c, r4a, _t3) UNPACK_S0(r4c, r4a, _t3)
#define LOADCOLUMN(x, s, a)\ #define LOADCOLUMN(x, s, a)\
block[0] = col[(base + a + 0) % s];\ block[0] = col[(base + a + 0) % s];\
block[1] = col[(base + a + 1) % s];\ block[1] = col[(base + a + 1) % s];\
@@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
case 1: case 1:
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
ctx->state[5], ctx->state[ 6], ctx->state[8], ctx->state[5], ctx->state[ 6], ctx->state[8],
ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[1], ctx->state[7], ctx->state[8],
ctx->state[6], ctx->state[0], ctx->state[6], ctx->state[6], ctx->state[0], ctx->state[6],
ctx->state[7], ctx->state[5], ctx->state[11], ctx->state[7], ctx->state[5], ctx->state[11],
ctx->state[5], ctx->state[6], ctx->state[4], ctx->state[5], ctx->state[6], ctx->state[4],
ctx->state[10] ); ctx->state[10] );
ctx->base++; ctx->base++;
pmsg += 4; pmsg += 4;
uBlockCount--; uBlockCount--;
@@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      case 2:
         TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
                       ctx->state[ 1], ctx->state[2], ctx->state[4],
                       ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
         SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
                        ctx->state[9], ctx->state[3], ctx->state[4],
                        ctx->state[2], ctx->state[8], ctx->state[2],
                        ctx->state[3], ctx->state[1], ctx->state[7],
                        ctx->state[1], ctx->state[2], ctx->state[0],
                        ctx->state[6]);
         ctx->base = 0;
         pmsg += 4;
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
         break;
   }

   while( uBlockCount > 0 )
   {
      TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
                    ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
                     ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
                     ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
                     ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
                    ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
                     ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
                     ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
                     ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
                    ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
              _t0, _t1, _t2);
      SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
                     ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
                     ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
                     ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
      ctx->base = 0;
      pmsg += 4;
@@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
void Final512(hashState_fugue *ctx, BitSequence *hashval)
{
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
   unsigned int i, base;
   __m128i r0, _t0, _t1, _t2, _t3;
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
View File
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
   j = _mm_cmpgt_epi8( m128_zero, i);\
   i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
   i = mm128_xorand(i, j, k );\
}

/**/
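The rewrite folds MUL2's four-instruction doubling into three: the signed compare against zero extracts each byte's top bit as a 0xFF mask directly, and the conditional reduction by 0x1b becomes one fused xor-and. A minimal, self-contained sketch of the trick; the mm128_xorand name follows the diff, while the vpternlogd immediate 0x78 (the truth table of a ^ (b & c)) and the test harness are my own illustration:

   #include <immintrin.h>
   #include <stdint.h>
   #include <stdio.h>

   // a ^ (b & c): one vpternlogd under AVX512VL, two ops otherwise.
   #if defined(__AVX512VL__)
   #define mm128_xorand( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x78 )
   #else
   #define mm128_xorand( a, b, c )  _mm_xor_si128( a, _mm_and_si128( b, c ) )
   #endif

   // MUL2 as in the diff: double each byte in GF(2^8) mod x^8+x^4+x^3+x+1.
   #define MUL2( i, j, k ) { \
      j = _mm_cmpgt_epi8( _mm_setzero_si128(), i ); /* 0xFF where top bit set */ \
      i = _mm_add_epi8( i, i );                     /* shift each byte left 1 */ \
      i = mm128_xorand( i, j, k );                  /* xor 0x1b where it carried */ \
   }

   int main( void )
   {
      __m128i i = _mm_set1_epi8( (char)0x80 );      // xtime(0x80) must be 0x1b
      __m128i j, k = _mm_set1_epi8( 0x1b );
      MUL2( i, j, k );
      uint8_t out[16];
      _mm_storeu_si128( (__m128i*)out, i );
      printf( "0x%02x\n", out[0] );                 // prints 0x1b
      return 0;
   }

MixBytes invokes MUL2 sixteen times per round, so one instruction saved here is worth more than it looks.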
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
/* one round
 * a0-a7 = input rows
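The ternary-logic MixBytes above also leans on a three-input XOR. A hedged sketch of a matching helper (the name follows the diff; 0x96 is the standard three-input parity table, so one vpternlogd does what the portable fallback needs two XORs for):

   #include <immintrin.h>

   // a ^ b ^ c in a single instruction where AVX512VL is available.
   #if defined(__AVX512VL__)
   #define mm128_xor3( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x96 )
   #else
   #define mm128_xor3( a, b, c )  _mm_xor_si128( a, _mm_xor_si128( b, c ) )
   #endif

Each mm128_xor3 in the y-building section replaces a dependent xor-xor pair, which is where most of the ternary-logic savings in this path come from.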
View File
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
   j = _mm_cmpgt_epi8( m128_zero, i);\
   i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
   i = mm128_xorand(i, j, k );\
}

/* Yet another implementation of MixBytes.
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
/* one round
 * i = round number
 * a0-a7 = input rows
View File
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm512_xor_si512(j, j);\
-  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
   i = _mm512_add_epi8(i, i);\
-  j = _mm512_and_si512(j, k);\
-  i = _mm512_xor_si512(i, j);\
   i = mm512_xorand( i, j, k );\
}
/* Yet another implementation of MixBytes.
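The 512-bit MUL2 above changes shape because AVX512BW compares write a mask register rather than a vector, so the byte-sign mask has to be rebuilt with vpmovm2b before the fused xor-and. A hedged sketch of the same doubling step as a standalone function (gf256_double and the helper macro are names I introduce for illustration):

   #include <immintrin.h>

   #define mm512_xorand( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x78 )

   // Double all 64 bytes in GF(2^8); poly_1b holds 0x1b in every byte.
   static inline __m512i gf256_double( __m512i i, const __m512i poly_1b )
   {
      __mmask64 hi = _mm512_cmpgt_epi8_mask( _mm512_setzero_si512(), i );
      __m512i j = _mm512_movm_epi8( hi );   // 0xFF where the top bit was set
      i = _mm512_add_epi8( i, i );
      return mm512_xorand( i, j, poly_1b );
   }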
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
#if 0
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  b0 = _mm512_xor_si512(b0, a3);\
  b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
#endif
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
View File
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm512_xor_si512(j, j);\
-  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
   i = _mm512_add_epi8(i, i);\
-  j = _mm512_and_si512(j, k);\
-  i = _mm512_xor_si512(i, j);\
   i = mm512_xorand( i, j, k );\
}

/**/
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
                  b0, b1, b2, b3, b4, b5, b6, b7) { \
  /* t_i = a_i + a_{i+1} */\
  b6 = a0; \
  b7 = a1; \
  a0 = _mm512_xor_si512( a0, a1 ); \
  b0 = a2; \
  a1 = _mm512_xor_si512( a1, a2 ); \
  b1 = a3; \
-  a2 = _mm512_xor_si512(a2, a3);\
  TEMP2 = _mm512_xor_si512( a2, a3 ); \
  b2 = a4; \
  a3 = _mm512_xor_si512( a3, a4 ); \
  b3 = a5; \
  a4 = _mm512_xor_si512( a4, a5 );\
  b4 = a6; \
  a5 = _mm512_xor_si512( a5, a6 ); \
  b5 = a7; \
  a6 = _mm512_xor_si512( a6, a7 ); \
  a7 = _mm512_xor_si512( a7, b6 ); \
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm512_xor_si512(b0, a4);\
-  b6 = _mm512_xor_si512(b6, a4);\
-  b1 = _mm512_xor_si512(b1, a5);\
-  b7 = _mm512_xor_si512(b7, a5);\
-  b2 = _mm512_xor_si512(b2, a6);\
-  b0 = _mm512_xor_si512(b0, a6);\
  TEMP0 = mm512_xor3( b0, a4, a6 ); \
  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  b3 = _mm512_xor_si512(b3, a7);\
-  b1 = _mm512_xor_si512(b1, a7);\
-  TEMP1 = b1;\
-  b4 = _mm512_xor_si512(b4, a0);\
-  b2 = _mm512_xor_si512(b2, a0);\
  TEMP1 = mm512_xor3( b1, a5, a7 ); \
  b2 = mm512_xor3( b2, a6, a0 ); \
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
  b0 = a0; \
-  b5 = _mm512_xor_si512(b5, a1);\
-  b3 = _mm512_xor_si512(b3, a1);\
  b3 = mm512_xor3( b3, a7, a1 ); \
  b1 = a1; \
-  b6 = _mm512_xor_si512(b6, a2);\
-  b4 = _mm512_xor_si512(b4, a2);\
-  TEMP2 = a2;\
-  b7 = _mm512_xor_si512(b7, a3);\
-  b5 = _mm512_xor_si512(b5, a3);\
  b6 = mm512_xor3( b6, a4, TEMP2 ); \
  b4 = mm512_xor3( b4, a0, TEMP2 ); \
  b7 = mm512_xor3( b7, a5, a3 ); \
  b5 = mm512_xor3( b5, a1, a3 ); \
  \
  /* compute x_i = t_i + t_{i+3} */\
  a0 = _mm512_xor_si512( a0, a3 ); \
  a1 = _mm512_xor_si512( a1, a4 ); \
-  a2 = _mm512_xor_si512(a2, a5);\
  a2 = _mm512_xor_si512( TEMP2, a5 ); \
  a3 = _mm512_xor_si512( a3, a6 ); \
  a4 = _mm512_xor_si512( a4, a7 ); \
  a5 = _mm512_xor_si512( a5, b0 ); \
  a6 = _mm512_xor_si512( a6, b1 ); \
  a7 = _mm512_xor_si512( a7, TEMP2 ); \
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
  MUL2( a0, b0, b1 ); \
  a0 = _mm512_xor_si512( a0, TEMP0 ); \
  MUL2( a1, b0, b1 ); \
  a1 = _mm512_xor_si512( a1, TEMP1 ); \
  MUL2( a2, b0, b1 ); \
  a2 = _mm512_xor_si512( a2, b2 ); \
  MUL2( a3, b0, b1 ); \
  a3 = _mm512_xor_si512( a3, b3 ); \
  MUL2( a4, b0, b1 ); \
  a4 = _mm512_xor_si512( a4, b4 ); \
  MUL2( a5, b0, b1 ); \
  a5 = _mm512_xor_si512( a5, b5 ); \
  MUL2( a6, b0, b1 ); \
  a6 = _mm512_xor_si512( a6, b6 ); \
  MUL2( a7, b0, b1 ); \
  a7 = _mm512_xor_si512( a7, b7 ); \
  \
  /* compute v_i : double w_i */\
  /* add to y_4 y_5 .. v3, v4, ... */\
  MUL2( a0, b0, b1 ); \
  b5 = _mm512_xor_si512( b5, a0 ); \
  MUL2( a1, b0, b1 ); \
  b6 = _mm512_xor_si512( b6, a1 ); \
  MUL2( a2, b0, b1 ); \
  b7 = _mm512_xor_si512( b7, a2 ); \
  MUL2( a5, b0, b1 ); \
  b2 = _mm512_xor_si512( b2, a5 ); \
  MUL2( a6, b0, b1 ); \
  b3 = _mm512_xor_si512( b3, a6 ); \
  MUL2( a7, b0, b1 ); \
  b4 = _mm512_xor_si512( b4, a7 ); \
  MUL2( a3, b0, b1 ); \
  MUL2( a4, b0, b1 ); \
  b0 = TEMP0;\
  b1 = TEMP1;\
  b0 = _mm512_xor_si512( b0, a3 ); \
  b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
/* one round
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
-  j = _mm256_xor_si256(j, j);\
-  j = _mm256_cmpgt_epi8(j, i );\
   j = _mm256_cmpgt_epi8( m256_zero, i );\
   i = _mm256_add_epi8(i, i);\
-  j = _mm256_and_si256(j, k);\
-  i = _mm256_xor_si256(i, j);\
   i = mm256_xorand( i, j, k );\
}

#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
View File
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
   rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );

   uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
-  intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
-                   hash6, hash7 );

#else
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
                 hash4, hash5, hash6, hash7, input, 640 );

   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );

-  intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
-               hash4, hash5, hash6, hash7, 512 );

#endif
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
   sha256_8way_update( &ctx.sha, vhash, 64 );
   sha256_8way_close( &ctx.sha, output );
}
View File
@@ -560,22 +560,14 @@ do { \
      __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
      dm = mm512_negate_32( _mm512_or_si512( dm, \
                            _mm512_slli_epi64( dm, 32 ) ) ); \
-     m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[0] ) ) ); \
-     m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[1] ) ) ); \
-     m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[2] ) ) ); \
-     m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[3] ) ) ); \
-     m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[4] ) ) ); \
-     m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[5] ) ) ); \
-     m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[6] ) ) ); \
-     m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[7] ) ) ); \
      m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
      m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
      m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
      m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
      m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
      m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
      m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
      m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
      tp += 8; \
      db = _mm512_srli_epi64( db, 1 ); \
   } \
@@ -585,20 +577,13 @@ do { \
do { \
   __m512i t; \
   t = a; \
-  a = _mm512_and_si512( a, c ); \
-  a = _mm512_xor_si512( a, d ); \
-  c = _mm512_xor_si512( c, b ); \
-  c = _mm512_xor_si512( c, a ); \
-  d = _mm512_or_si512( d, t ); \
-  d = _mm512_xor_si512( d, b ); \
   a = mm512_xorand( d, a, c ); \
   c = mm512_xor3( a, b, c ); \
   b = mm512_xoror( b, d, t ); \
   t = _mm512_xor_si512( t, c ); \
-  b = d; \
-  d = _mm512_or_si512( d, t ); \
-  d = _mm512_xor_si512( d, a ); \
-  a = _mm512_and_si512( a, b ); \
-  t = _mm512_xor_si512( t, a ); \
-  b = _mm512_xor_si512( b, d ); \
-  b = _mm512_xor_si512( b, t ); \
   d = mm512_xoror( a, b, t ); \
   t = mm512_xorand( t, a, b ); \
   b = mm512_xor3( b, d, t ); \
   a = c; \
   c = b; \
   b = d; \
@@ -609,14 +594,12 @@ do { \
do { \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c, 3 ); \
-  b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
-  d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
-                                             _mm512_slli_epi32( a, 3 ) ) ); \
   b = mm512_xor3( a, b, c ); \
   d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
   b = mm512_rol_32( b, 1 ); \
   d = mm512_rol_32( d, 7 ); \
-  a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
-  c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
-                                             _mm512_slli_epi32( b, 7 ) ) ); \
   a = mm512_xor3( a, b, d ); \
   c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
   a = mm512_rol_32( a, 5 ); \
   c = mm512_rol_32( c, 22 ); \
} while (0)
View File
@@ -522,50 +522,53 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
_mm256_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm256_andnotxor( a, b, c ) \
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )
#endif
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( x0, \
-     _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
-                       _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
-                                         _mm256_and_si256( x3, x6 ) ) ) ) \
   mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
               _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
                                 _mm256_and_si256( x3, x6 ) ) ) \

#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x2, \
-        _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
-                          _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
-                                            _mm256_xor_si256( x6, x0 ) ) ) ), \
-     _mm256_xor_si256( \
-        _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
-        _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
   mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
                             mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
               mm256_andxor( x4, x1, x5 ), \
               mm256_xorand( x0, x3, x5 ) ) \

#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x3, \
-        _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                          _mm256_xor_si256( x6, x0 ) ) ), \
-     _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
-                       _mm256_and_si256( x2, x5 ) ), x0 ) )
   mm256_xor3( x0, \
               _mm256_and_si256( x3, \
                           mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
               _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
                                 _mm256_and_si256( x2, x5 ) ) )

#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_xor_si256( \
-        _mm256_and_si256( x3, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                             _mm256_or_si256( x4, x6 ) ), x5 ) ), \
-        _mm256_and_si256( x4, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
-                             _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
-     _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
   mm256_xor3( \
      mm256_andxor( x3, x5, \
                    _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
                                      _mm256_or_si256( x4, x6 ) ) ), \
      _mm256_and_si256( x4, \
                        mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
                                    _mm256_xor_si256( x1, x6 ) ) ), \
      mm256_xorand( x0, x2, x6 ) )

#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x0, \
-        mm256_not( _mm256_xor_si256( \
-           _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
-     _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
-                                         _mm256_and_si256( x2, x5 ) ), \
-                       _mm256_and_si256( x3, x6 ) ) )
   _mm256_xor_si256( \
      mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
      mm256_xor3( _mm256_and_si256( x1, x4 ), \
                  _mm256_and_si256( x2, x5 ), \
                  _mm256_and_si256( x3, x6 ) ) )

#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
   F1_8W(x1, x0, x3, x5, x6, x2, x4)
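The 0x82 immediate in mm256_andnotxor above, like the others, can be sanity-checked by brute force: bit (a<<2 | b<<1 | c) of the immediate must equal the desired function value for that input pattern. A small self-contained checker, purely illustrative:

   #include <stdio.h>
   #include <stdint.h>

   int main( void )
   {
      const uint8_t imm = 0x82;                    // table for (~(a ^ b)) & c
      for ( int a = 0; a <= 1; a++ )
      for ( int b = 0; b <= 1; b++ )
      for ( int c = 0; c <= 1; c++ )
      {
         int want = ( ~( a ^ b ) & c ) & 1;        // reference value
         int got  = ( imm >> ( a << 2 | b << 1 | c ) ) & 1;
         printf( "a=%d b=%d c=%d -> %d expect %d %s\n",
                 a, b, c, got, want, got == want ? "ok" : "BAD" );
      }
      return 0;
   }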
View File
@@ -51,15 +51,15 @@ extern "C"{
do { \
   __m512i cc = _mm512_set1_epi64( c ); \
   x3 = mm512_not( x3 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
   x0 = mm512_xorandnot( x0, x2, cc ); \
-  tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
   tmp = mm512_xorand( cc, x0, x1 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
   x0 = mm512_xorand( x0, x2, x3 ); \
-  x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
   x3 = mm512_xorandnot( x3, x1, x2 ); \
-  x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
   x1 = mm512_xorand( x1, x0, x2 ); \
-  x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
   x2 = mm512_xorandnot( x2, x3, x0 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
   x0 = mm512_xoror( x0, x1, x3 ); \
-  x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
   x3 = mm512_xorand( x3, x1, x2 ); \
-  x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
   x1 = mm512_xorand( x1, tmp, x0 ); \
   x2 = _mm512_xor_si512( x2, tmp ); \
} while (0)
@@ -67,11 +67,11 @@ do { \
do { \
   x4 = _mm512_xor_si512( x4, x1 ); \
   x5 = _mm512_xor_si512( x5, x2 ); \
-  x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
   x6 = mm512_xor3( x6, x3, x0 ); \
   x7 = _mm512_xor_si512( x7, x0 ); \
   x0 = _mm512_xor_si512( x0, x5 ); \
   x1 = _mm512_xor_si512( x1, x6 ); \
-  x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
   x2 = mm512_xor3( x2, x7, x4 ); \
   x3 = _mm512_xor_si512( x3, x4 ); \
} while (0)
@@ -318,12 +318,12 @@ static const sph_u64 C[] = {
#define Wz_8W(x, c, n) \
do { \
   __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
-  x ## h = _mm512_or_si512( _mm512_and_si512( \
-                             _mm512_srli_epi64(x ## h, (n)), (c)), t ); \
   x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
   t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
-  x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
   x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
} while (0)

#define W80(x)   Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x)   Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x)   Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
View File
@@ -76,6 +76,9 @@ static const uint64_t RC[] = {
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))

#include "keccak-macros.c"
@@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX512
@@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst)
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))

#include "keccak-macros.c"
@@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX2
View File
@@ -110,20 +110,34 @@
#ifdef KHI_XO
#undef KHI_XO
#endif

#define KHI_XO(d, a, b, c) do { \
   XOROR(d, a, b, c); \
} while (0)

/*
#define KHI_XO(d, a, b, c) do { \
   DECL64(kt); \
   OR64(kt, b, c); \
   XOR64(d, a, kt); \
} while (0)
*/
#ifdef KHI_XA
#undef KHI_XA
#endif

#define KHI_XA(d, a, b, c) do { \
   XORAND(d, a, b, c); \
} while (0)

/*
#define KHI_XA(d, a, b, c) do { \
   DECL64(kt); \
   AND64(kt, b, c); \
   XOR64(d, a, kt); \
} while (0)
*/

#ifdef KHI
#undef KHI
View File
@@ -97,6 +97,21 @@ do { \
   MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
   ADD_CONSTANT4W(*x, *(x+4), c0, c1);
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = a0;\
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0);\
a1 = _mm512_or_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
t = _mm512_xor_si512(t,a1);\
a2 = _mm512_and_si512(a2,a1);\
a1 = mm512_xnor(a1,a0);\
a0 = t;
/*
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
   t = _mm512_load_si512(&a0);\
   a0 = _mm512_or_si512(a0,a1);\
@@ -115,7 +130,25 @@ do { \
   a2 = _mm512_and_si512(a2,a1);\
   a1 = _mm512_xor_si512(a1,a0);\
   a0 = _mm512_load_si512(&t);
*/
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = mm512_xoror( a, t1, t2 ); \
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
/*
#define MIXWORD4W(a,b,t1,t2)\
   b = _mm512_xor_si512(a,b);\
   t1 = _mm512_slli_epi32(a,2);\
@@ -133,6 +166,7 @@ do { \
   t1 = _mm512_slli_epi32(b,1);\
   t2 = _mm512_srli_epi32(b,31);\
   b = _mm512_or_si512(t1,t2);
*/
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
   a1 = _mm512_shuffle_epi32(a1,147);\
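One detail worth calling out in the new SUBCRUMB4W above: the inline ternary-logic call with immediate 0x87 is the bitwise complement of the 0x78 table, i.e. ~(a1 ^ (a3 & t)), exactly the xnor described in its comment. A hedged named form of that operation (the helper name is hypothetical, not from the source):

   #include <immintrin.h>

   // ~(a ^ (b & c)): complement of table 0x78, hence immediate 0x87.
   #define mm512_xnorand( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x87 )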
@@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    __m512i tmp[2];
    __m512i x[8];

-   t0 = chainv[0];
-   t1 = chainv[1];
-   t0 = _mm512_xor_si512( t0, chainv[2] );
-   t1 = _mm512_xor_si512( t1, chainv[3] );
-   t0 = _mm512_xor_si512( t0, chainv[4] );
-   t1 = _mm512_xor_si512( t1, chainv[5] );
-   t0 = _mm512_xor_si512( t0, chainv[6] );
-   t1 = _mm512_xor_si512( t1, chainv[7] );
-   t0 = _mm512_xor_si512( t0, chainv[8] );
-   t1 = _mm512_xor_si512( t1, chainv[9] );
    t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
    t0 = mm512_xor3( t0, chainv[6], chainv[8] );
    t1 = mm512_xor3( t1, chainv[7], chainv[9] );

    MULT24W( t0, t1 );
@@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

    MULT24W( chainv[0], chainv[1] );
-   chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
-   chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
    chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
    chainv[1] = mm512_xor3( chainv[1], t1, msg1 );

    MULT24W( msg0, msg1 );
    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
@@ -399,18 +426,10 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    /*---- blank round with m=0 ----*/
    rnd512_4way( state, zero );

-   t[0] = chainv[0];
-   t[1] = chainv[1];
-   t[0] = _mm512_xor_si512( t[0], chainv[2] );
-   t[1] = _mm512_xor_si512( t[1], chainv[3] );
-   t[0] = _mm512_xor_si512( t[0], chainv[4] );
-   t[1] = _mm512_xor_si512( t[1], chainv[5] );
-   t[0] = _mm512_xor_si512( t[0], chainv[6] );
-   t[1] = _mm512_xor_si512( t[1], chainv[7] );
-   t[0] = _mm512_xor_si512( t[0], chainv[8] );
-   t[1] = _mm512_xor_si512( t[1], chainv[9] );
    t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
    t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
    t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );

    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );
@@ -676,8 +695,6 @@ do { \
   a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)

-// confirm pointer arithmetic
-// ok but use array indexes
#define STEP_PART(x,c0,c1,t)\
   SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
   SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -688,23 +705,23 @@ do { \
   ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB(a0,a1,a2,a3,t)\
-  t = _mm256_load_si256(&a0);\
   t = a0;\
   a0 = _mm256_or_si256(a0,a1);\
   a2 = _mm256_xor_si256(a2,a3);\
-  a1 = _mm256_andnot_si256(a1, m256_neg1 );\
   a1 = mm256_not( a1 );\
   a0 = _mm256_xor_si256(a0,a3);\
   a3 = _mm256_and_si256(a3,t);\
   a1 = _mm256_xor_si256(a1,a3);\
   a3 = _mm256_xor_si256(a3,a2);\
   a2 = _mm256_and_si256(a2,a0);\
-  a0 = _mm256_andnot_si256(a0, m256_neg1 );\
   a0 = mm256_not( a0 );\
   a2 = _mm256_xor_si256(a2,a1);\
   a1 = _mm256_or_si256(a1,a3);\
   t = _mm256_xor_si256(t,a1);\
   a3 = _mm256_xor_si256(a3,a2);\
   a2 = _mm256_and_si256(a2,a1);\
   a1 = _mm256_xor_si256(a1,a0);\
-  a0 = _mm256_load_si256(&t);\
   a0 = t;\

#define MIXWORD(a,b,t1,t2)\
   b = _mm256_xor_si256(a,b);\
View File
@@ -312,10 +312,26 @@ do { \
   BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )
#else
#define GAMMA_8W(n0, n1, n2, n4) \
   (g ## n0 = _mm256_xor_si256( a ## n0, \
                         _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
a ## n4 ) ) )
#endif
#define PI_ALL_8W do { \
   a0 = g0; \
   a1 = mm256_rol_32( g7, 1 ); \
@@ -336,9 +352,6 @@ do { \
   a16 = mm256_rol_32( g10, 8 ); \
} while (0)

-#define THETA_8W(n0, n1, n2, n4) \
-   ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
-                                                            a ## n4 ) ) )

#define SIGMA_ALL_8W do { \
   a0 = _mm256_xor_si256( g0, m256_one_32 ); \
View File
@@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input )
   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-  if ( ( vh_mask & 0x0f ) != 0x0f )
-     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
-  if ( ( vh_mask & 0xf0 ) != 0xf0 )
-     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
   groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
   groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
@@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input )
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 512 );

-  if ( hash0[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-  if ( hash1[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-  if ( hash2[0] & 8)
-     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-  if ( hash3[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-  if ( hash4[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-  if ( hash5[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-  if ( hash6[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-  if ( hash7[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
   groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
   groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
   groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
   groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                512 );
View File
@@ -39,17 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
-#if defined(HMAC_SPH_SHA)
   sph_sha256_context ctx;
   sph_sha256_init( &ctx );
   sph_sha256( &ctx, in, len );
   sph_sha256_close( &ctx, digest );
-#else
-  SHA256_CTX ctx;
-  SHA256_Init( &ctx );
-  SHA256_Update( &ctx, in, len );
-  SHA256_Final( digest, &ctx );
-#endif
}

/**
@@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
   /* If Klen > 64, the key is really SHA256(K). */
   if ( Klen > 64 )
   {
-#if defined(HMAC_SPH_SHA)
      sph_sha256_init( &ctx->ictx );
      sph_sha256( &ctx->ictx, K, Klen );
      sph_sha256_close( &ctx->ictx, khash );
-#else
-     SHA256_Init( &ctx->ictx );
-     SHA256_Update( &ctx->ictx, K, Klen );
-     SHA256_Final( khash, &ctx->ictx );
-#endif
      K = khash;
      Klen = 32;
   }
   /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-#if defined(HMAC_SPH_SHA)
   sph_sha256_init( &ctx->ictx );
-#else
-  SHA256_Init( &ctx->ictx );
-#endif
   for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
   memset( pad + Klen, 0x36, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->ictx, pad, 64 );
-#else
-  SHA256_Update( &ctx->ictx, pad, 64 );
-#endif

   /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-#if defined(HMAC_SPH_SHA)
   sph_sha256_init( &ctx->octx );
-#else
-  SHA256_Init( &ctx->octx );
-#endif
   for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
   memset( pad + Klen, 0x5c, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->octx, pad, 64 );
-#else
-  SHA256_Update( &ctx->octx, pad, 64 );
-#endif
}

/* Add bytes to the HMAC-SHA256 operation. */
@@ -131,11 +102,7 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
   /* Feed data to the inner SHA256 operation. */
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->ictx, in, len );
-#else
-  SHA256_Update( &ctx->ictx, in, len );
-#endif
}

/* Finish an HMAC-SHA256 operation. */
@@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
{
   unsigned char ihash[32];

-#if defined(HMAC_SPH_SHA)
   sph_sha256_close( &ctx->ictx, ihash );
   sph_sha256( &ctx->octx, ihash, 32 );
   sph_sha256_close( &ctx->octx, digest );
-#else
-  /* Finish the inner SHA256 operation. */
-  SHA256_Final( ihash, &ctx->ictx );
-  /* Feed the inner hash to the outer SHA256 operation. */
-  SHA256_Update( &ctx->octx, ihash, 32 );
-  /* Finish the outer SHA256 operation. */
-  SHA256_Final( digest, &ctx->octx );
-#endif
}

/**
View File
@@ -29,24 +29,14 @@
#ifndef HMAC_SHA256_H__
#define HMAC_SHA256_H__

-//#define HMAC_SSL_SHA 1
-#define HMAC_SPH_SHA 1

#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
-#include <openssl/sha.h>

typedef struct HMAC_SHA256Context
{
-#if defined(HMAC_SPH_SHA)
   sph_sha256_context ictx;
   sph_sha256_context octx;
-#else
-  SHA256_CTX ictx;
-  SHA256_CTX octx;
-#endif
} HMAC_SHA256_CTX;

void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
View File
@@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
                          size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
                            const __m128i *state_in );

#endif // SSE2
@@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
                            const __m256i *state_in );

#endif // AVX2
@@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
                             const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
                                   const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
                                const __m512i *state_in, const __m512i *state_mid );

#endif // AVX512
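The new prehash/final-rounds split is what the v3.17.0 sha256t speedup rests on: the nonce sits in message word 3 of the second block, so the first three rounds consume only data that is constant across nonces and can be computed once per job. A hedged usage sketch against the prototypes above (the mid-state size and buffer layout are my assumptions, not taken from the header):

   #include <immintrin.h>

   void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
                                      const __m512i *state_in );
   void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
                                   const __m512i *state_in,
                                   const __m512i *state_mid );

   // Hash one 16-lane batch: the nonce-independent rounds land in 'mid'
   // (once per job); only the remaining rounds need to run per batch.
   static void hash_16lanes( __m512i hash[8], const __m512i W[16],
                             const __m512i state_in[8] )
   {
      __m512i mid[8];                   // assumed size of the saved mid-state
      sha256_16way_prehash_3rounds( mid, W, state_in );
      sha256_16way_final_rounds( hash, W, state_in, mid );
   }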
View File
@@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
      hash[i] = swab32(hash[i]);
}

-extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{ {
sph_sha256_context ctx __attribute__ ((aligned (64)));
sph_sha256_init( &ctx );
sph_sha256( &ctx, data, len );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
}
#else
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
   uint32_t S[16], T[16];
   int i, r;
@@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
      be32enc((uint32_t *)hash + i, T[i]);
}
#endif
static inline void sha256d_preextend(uint32_t *W)
{
   W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
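Whichever branch is compiled, sha256d keeps the same contract: hash = SHA-256(SHA-256(data)). A minimal usage sketch, assuming the translation unit above is linked in:

   #include <stdio.h>

   void sha256d( unsigned char *hash, const unsigned char *data, int len );

   int main( void )
   {
      unsigned char header[80] = {0};   // e.g. an 80-byte block header
      unsigned char hash[32];
      sha256d( hash, header, 80 );
      for ( int i = 0; i < 32; i++ ) printf( "%02x", hash[i] );
      printf( "\n" );
      return 0;
   }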
View File
@@ -0,0 +1,345 @@
/* Intel SHA extensions using C intrinsics */
/* Written and place in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash-opt.h"
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif
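A minimal usage sketch for the interleaved 2-way transform, assuming the caller has already arranged both messages in big-endian word order (this stripped-down version does no byte swapping) and keeps all buffers 16-byte aligned for the vector loads; the names below are illustrative:

uint32_t state_X[8] __attribute__ ((aligned (16)));   // running state, lane X
uint32_t state_Y[8] __attribute__ ((aligned (16)));   // running state, lane Y
uint32_t msg_X[16]  __attribute__ ((aligned (16)));   // one 64 byte block
uint32_t msg_Y[16]  __attribute__ ((aligned (16)));   // one 64 byte block

// ... set both states (e.g. to the SHA-256 IV) and fill both blocks ...
// In-place update is safe: the inputs are loaded before the outputs
// are stored.
sha256_ni2way_transform( state_X, state_Y, msg_X, msg_Y, state_X, state_Y );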

View File

@@ -74,9 +74,20 @@ static const uint32_t K256[64] =
#define CHs(X, Y, Z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
/*
#define MAJs(X, Y, Z) \
   _mm_or_si128( _mm_and_si128( X, Y ), \
                 _mm_and_si128( _mm_or_si128( X, Y ), Z ) )
*/
/*
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
_mm_xor_si128( Y, Z ) ) )
*/
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
#define BSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
@@ -94,6 +105,7 @@ static const uint32_t K256[64] =
   _mm_xor_si128( _mm_xor_si128( \
      mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
@@ -122,9 +134,9 @@ do { \
   H = _mm_add_epi32( T1, T2 ); \
   D = _mm_add_epi32( D, T1 ); \
} while (0)
*/

#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   __m128i T1, T2; \
@@ -132,16 +144,98 @@ do { \
   T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
                                         K, W[i] ) ); \
   T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
   Y_xor_Z = X_xor_Y; \
   D = _mm_add_epi32( D, T1 ); \
   H = _mm_add_epi32( T1, T2 ); \
} while (0)
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
{
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m128i W[16];
memcpy_128( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
Y_xor_Z = _mm_xor_si128( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
}
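The MAJs macro above relies on the identity MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)) together with the round rotation of the working variables: the next round's (Y,Z) pair is this round's (X,Y), so the X^Y just computed is carried forward as the next round's Y^Z, saving one XOR per round. A scalar sketch of the same trick (illustrative, not part of the source):

static inline uint32_t maj_fwd( uint32_t x, uint32_t y, uint32_t *y_xor_z )
{
   const uint32_t x_xor_y = x ^ y;                  // reused next round
   const uint32_t maj = y ^ ( x_xor_y & *y_xor_z );
   *y_xor_z = x_xor_y;                              // becomes Y ^ Z next round
   return maj;
}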
static void
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
{
   register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
   __m128i W[16];

   mm128_block_bswap_32( W, in );
@@ -170,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
      H = m128_const1_64( 0x5BE0CD195BE0CD19 );
   }
Y_xor_Z = _mm_xor_si128( B, C );
   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -321,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
   sha256_4way_round( sc, sc->buf, sc->val );
   mm128_block_bswap_32( dst, sc->val );
@@ -342,12 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
// SHA-256 8 way
#if defined(__AVX512VL__)
#define CHx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define BSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )
#define BSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )
#define SSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) )
#define SSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) )
#else // AVX2
#define CHx(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJx(X, Y, Z) \
   _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
                                          _mm256_xor_si256( Y, Z ) ) )
/*
#define MAJx(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
*/
#define BSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -365,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
   _mm256_xor_si256( _mm256_xor_si256( \
      mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
#endif // AVX512 else AVX2
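The 0xca and 0xe8 immediates in the AVX512 branch are simply the 8-entry truth tables of CH and MAJ: vpternlog returns bit ((x<<2) | (y<<1) | z) of the immediate for input bits (x, y, z). A scalar sketch (illustrative only) that derives both constants:

static uint32_t ch_ref ( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( ~x & z ); }

static uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( x & z ) | ( y & z ); }

static uint8_t ternlog_imm( uint32_t (*f)( uint32_t, uint32_t, uint32_t ) )
{
   uint8_t imm = 0;
   for ( int x = 0; x < 2; x++ )
   for ( int y = 0; y < 2; y++ )
   for ( int z = 0; z < 2; z++ )
      imm |= ( f( x, y, z ) & 1 ) << ( (x << 2) | (y << 1) | z );
   return imm;  // ternlog_imm( ch_ref ) == 0xca, ternlog_imm( maj_ref ) == 0xe8
}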
#define SHA2x_MEXP( a, b, c, d ) \
   mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
@@ -379,8 +502,89 @@ do { \
   H = _mm256_add_epi32( T1, T2 ); \
} while (0)
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
memcpy_256( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm256_add_epi32( state_in[0], A );
state_out[1] = _mm256_add_epi32( state_in[1], B );
state_out[2] = _mm256_add_epi32( state_in[2], C );
state_out[3] = _mm256_add_epi32( state_in[3], D );
state_out[4] = _mm256_add_epi32( state_in[4], E );
state_out[5] = _mm256_add_epi32( state_in[5], F );
state_out[6] = _mm256_add_epi32( state_in[6], G );
state_out[7] = _mm256_add_epi32( state_in[7], H );
}
static void
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
{
   register __m256i A, B, C, D, E, F, G, H;
   __m256i W[16];
@@ -566,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
   sha256_8way_round( sc, sc->buf, sc->val );
@@ -589,27 +791,22 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
// SHA-256 16 way
#define CHx16(X, Y, Z) \
   _mm512_ternarylogic_epi32( X, Y, Z, 0xca )

#define MAJx16(X, Y, Z) \
   _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )

#define BSG2_0x16(x) \
   mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )

#define BSG2_1x16(x) \
   mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )

#define SSG2_0x16(x) \
   mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )

#define SSG2_1x16(x) \
   mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )
#define SHA2x16_MEXP( a, b, c, d ) \
   mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
@@ -625,10 +822,216 @@ do { \
   H = _mm512_add_epi32( T1, T2 ); \
} while (0)
// Transform one 16-lane by 64-byte message block and update the state.
// The calling function is responsible for initializing the state, setting
// the correct byte order, counting bits, and padding the final block.
// It's faster for multiple rounds of sha256 (sha256d/t/q) because it
// eliminates redundant byte swapping.
//
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm512_add_epi32( state_in[0], A );
state_out[1] = _mm512_add_epi32( state_in[1], B );
state_out[2] = _mm512_add_epi32( state_in[2], C );
state_out[3] = _mm512_add_epi32( state_in[3], D );
state_out[4] = _mm512_add_epi32( state_in[4], E );
state_out[5] = _mm512_add_epi32( state_in[5], F );
state_out[6] = _mm512_add_epi32( state_in[6], G );
state_out[7] = _mm512_add_epi32( state_in[7], H );
}
// Aggressive prehashing
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
D = _mm512_load_si512( state_in + 3 );
E = _mm512_load_si512( state_in + 4 );
F = _mm512_load_si512( state_in + 5 );
G = _mm512_load_si512( state_in + 6 );
H = _mm512_load_si512( state_in + 7 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm512_store_si512( state_mid , A );
_mm512_store_si512( state_mid + 1, B );
_mm512_store_si512( state_mid + 2, C );
_mm512_store_si512( state_mid + 3, D );
_mm512_store_si512( state_mid + 4, E );
_mm512_store_si512( state_mid + 5, F );
_mm512_store_si512( state_mid + 6, G );
_mm512_store_si512( state_mid + 7, H );
}
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
A = _mm512_load_si512( state_mid );
B = _mm512_load_si512( state_mid + 1 );
C = _mm512_load_si512( state_mid + 2 );
D = _mm512_load_si512( state_mid + 3 );
E = _mm512_load_si512( state_mid + 4 );
F = _mm512_load_si512( state_mid + 5 );
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );
// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) );
B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) );
C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) );
D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) );
E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) );
F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) );
G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) );
H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) );
_mm512_store_si512( state_out , A );
_mm512_store_si512( state_out + 1, B );
_mm512_store_si512( state_out + 2, C );
_mm512_store_si512( state_out + 3, D );
_mm512_store_si512( state_out + 4, E );
_mm512_store_si512( state_out + 5, F );
_mm512_store_si512( state_out + 6, G );
_mm512_store_si512( state_out + 7, H );
}
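The prehash/final split pays off in the scanhash loops further down: in the second block of an 80-byte header the nonce occupies W[3] (pdata[19]), so rounds 0-2 depend only on W[0..2] = pdata[16..18], which are constant for the whole scan. Condensed from scanhash_sha256t_16way below, the caller's pattern is:

// Once per job:
sha256_16way_transform( midstate, vdata, initstate );             // block 1
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );  // rounds 0-2

// Once per batch of 16 nonces, with the nonce vector in block[3] (W[3]):
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );  // rounds 3-63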
static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
   register __m512i A, B, C, D, E, F, G, H;
   __m512i W[16];

   mm512_block_bswap_32( W , in );
@@ -657,6 +1060,7 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
      H = m512_const1_64( 0x5BE0CD195BE0CD19 );
   }
   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -800,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
   sha256_16way_round( sc, sc->buf, sc->val );

View File

@@ -3,23 +3,24 @@
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */

// A stripped down version with byte swapping removed.

#if defined(__SHA__)

#include "sha256-hash-opt.h"

void sha256_opt_transform( uint32_t *state_out, const void *input,
                           const uint32_t *state_in )
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_load_si128((__m128i*) &state_in[0]);
    STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
//  MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
@@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    CDGH_SAVE = STATE1;

    // Rounds 0-3
    TMSG0 = _mm_load_si128((const __m128i*) (input+0));
//  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 4-7
    TMSG1 = _mm_load_si128((const __m128i*) (input+16));
//  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 8-11
    TMSG2 = _mm_load_si128((const __m128i*) (input+32));
//  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 12-15
    TMSG3 = _mm_load_si128((const __m128i*) (input+48));
//  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
@@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // ABEF

    // Save state
    _mm_store_si128((__m128i*) &state_out[0], STATE0);
    _mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif

View File

@@ -0,0 +1,18 @@
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SHA__)
void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
#endif
#endif

algo/sha/sha256d-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_16way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
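A scalar restatement of the fixed-size padding the loop above builds by hand (illustrative; the big-endian word order of the vector code is assumed):

#include <stdint.h>
#include <string.h>

// Second block of the first hash: header words 16..19 then padding.
static void build_final_header_block( uint32_t block[16], const uint32_t *pdata )
{
   memcpy( block, pdata + 16, 4 * sizeof(uint32_t) );  // W[0..3], W[3] = nonce
   block[ 4] = 0x80000000;                             // leading pad bit
   memset( block + 5, 0, 10 * sizeof(uint32_t) );      // W[5..14] = 0
   block[15] = 80 * 8;                                 // message length in bits
}

// The single block of the second hash is built the same way: the 32 byte
// digest in W[0..7], then 0x80000000, zeros, and a bit count of 32*8.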
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i block[16] __attribute__ ((aligned (64)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -7,133 +7,173 @@
#if defined(SHA256T_16WAY)
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
void sha256t_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
}
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
{
   __m512i block[16]      __attribute__ ((aligned (64)));
   __m512i hash32[8]      __attribute__ ((aligned (32)));
   __m512i initstate[8]   __attribute__ ((aligned (32)));
   __m512i midstate[8]    __attribute__ ((aligned (32)));
   __m512i midstate2[8]   __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
   __m512i vdata[20]      __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 16;
   uint32_t n = first_nonce;
   __m512i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m512i last_byte = m512_const1_32( 0x80000000 );
   const __m512i sixteen = m512_const1_32( 16 );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = m512_const1_32( pdata[i] );

   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   // initialize state
   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
   initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
   initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
   initstate[4] = m512_const1_64( 0x510E527F510E527F );
   initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
   initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
   initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );

   // hash first 64 byte block of data
   sha256_16way_transform( midstate, vdata, initstate );

   // Do 3 rounds on the first 12 bytes of the next block
   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );

   do
   {
      // 1. final 16 bytes of data, with padding
      memcpy_512( block, vdata + 16, 4 );
      block[ 4] = last_byte;
      memset_zero_512( block + 5, 10 );
      block[15] = m512_const1_32( 80*8 ); // bit count
      sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
//    sha256_16way_transform( hash32, block, midstate );

      // 2. 32 byte hash from 1.
      memcpy_512( block, hash32, 8 );
      block[ 8] = last_byte;
      memset_zero_512( block + 9, 6 );
      block[15] = m512_const1_32( 32*8 ); // bit count
      sha256_16way_transform( hash32, block, initstate );

      // 3. 32 byte hash from 2.
      memcpy_512( block, hash32, 8 );
      sha256_16way_transform( hash32, block, initstate );

      // byte swap final hash for testing
      mm512_block_bswap_32( hash32, hash32 );

      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
      {
         extr_lane_16x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm512_add_epi32( *noncev, sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );

   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}
#endif
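In scalar terms the chain above is SHA-256 applied three times; note that step 3 reuses the padding tail block[8..15] left in place by step 2, since both hash a 32-byte digest and only the first 8 words change. An equivalent sketch using the sph API already present in this codebase (header is an illustrative 80-byte buffer):

sph_sha256_context ctx;
unsigned char h[32];

sph_sha256_init( &ctx ); sph_sha256( &ctx, header, 80 ); sph_sha256_close( &ctx, h );
sph_sha256_init( &ctx ); sph_sha256( &ctx, h, 32 );      sph_sha256_close( &ctx, h );
sph_sha256_init( &ctx ); sph_sha256( &ctx, h, 32 );      sph_sha256_close( &ctx, h );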
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256t_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   __m256i block[16]      __attribute__ ((aligned (64)));
   __m256i hash32[8]      __attribute__ ((aligned (32)));
   __m256i initstate[8]   __attribute__ ((aligned (32)));
   __m256i midstate[8]    __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
   __m256i vdata[20]      __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   __m256i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m256i last_byte = m256_const1_32( 0x80000000 );
   const __m256i eight = m256_const1_32( 8 );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = m256_const1_32( pdata[i] );

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   // initialize state
   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
   initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
   initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
   initstate[4] = m256_const1_64( 0x510E527F510E527F );
   initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
   initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
   initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
   sha256_8way_transform( midstate, vdata, initstate );

   do
   {
      // 1. final 16 bytes of data, with padding
      memcpy_256( block, vdata + 16, 4 );
      block[ 4] = last_byte;
      memset_zero_256( block + 5, 10 );
      block[15] = m256_const1_32( 80*8 ); // bit count
      sha256_8way_transform( hash32, block, midstate );

      // 2. 32 byte hash from 1.
      memcpy_256( block, hash32, 8 );
      block[ 8] = last_byte;
      memset_zero_256( block + 9, 6 );
      block[15] = m256_const1_32( 32*8 ); // bit count
      sha256_8way_transform( hash32, block, initstate );

      // 3. 32 byte hash from 2.
      memcpy_256( block, hash32, 8 );
      sha256_8way_transform( hash32, block, initstate );

      // byte swap final hash for testing
      mm256_block_bswap_32( hash32, hash32 );

      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
      {
         extr_lane_8x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm256_add_epi32( *noncev, eight );
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );

   pdata[19] = n;
   *hashes_done = n - first_nonce;
@@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_4WAY)

-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
-
-void sha256t_4way_hash( void* output, const void* input )
-{
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   sha256_4way_context ctx;
-   memcpy( &ctx, &sha256_ctx4, sizeof ctx );
-
-   sha256_4way_update( &ctx, input + (64<<2), 16 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, output );
-}
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   __m128i block[16]    __attribute__ ((aligned (64)));
+   __m128i hash32[8]    __attribute__ ((aligned (32)));
+   __m128i initstate[8] __attribute__ ((aligned (32)));
+   __m128i midstate[8]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   __m128i vdata[20] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32_d7 = ptarget[7];
    const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
    uint32_t n = first_nonce;
-   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
+   __m128i *noncev = vdata + 19;
    const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );

-   const uint64_t htmax[] = {          0,
-                                     0xF,
-                                    0xFF,
-                                   0xFFF,
-                                  0xFFFF,
-                              0x10000000 };
-   const uint32_t masks[] = { 0xFFFFFFFF,
-                              0xFFFFFFF0,
-                              0xFFFFFF00,
-                              0xFFFFF000,
-                              0xFFFF0000,
-                                       0 };
+   for ( int i = 0; i < 19; i++ )
+      vdata[i] = m128_const1_32( pdata[i] );

-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   sha256_4way_init( &sha256_ctx4 );
-   sha256_4way_update( &sha256_ctx4, vdata, 64 );
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   // initialize state
+   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
+   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
+   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
+   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
+   initstate[4] = m128_const1_64( 0x510E527F510E527F );
+   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
+   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+
+   // hash first 64 bytes of data
+   sha256_4way_transform( midstate, vdata, initstate );
+
+   do
    {
-      const uint32_t mask = masks[m];
-      do {
-         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
-         pdata[19] = n;
+      // 1. final 16 bytes of data, with padding
+      memcpy_128( block, vdata + 16, 4 );
+      block[ 4] = last_byte;
+      memset_zero_128( block + 5, 10 );
+      block[15] = m128_const1_32( 80*8 ); // bit count
+      sha256_4way_transform( hash32, block, midstate );

-         sha256t_4way_hash( hash, vdata );
+      // 2. 32 byte hash from 1.
+      memcpy_128( block, hash32, 8 );
+      block[ 8] = last_byte;
+      memset_zero_128( block + 9, 6 );
+      block[15] = m128_const1_32( 32*8 ); // bit count
+      sha256_4way_transform( hash32, block, initstate );

-         for ( int lane = 0; lane < 4; lane++ )
-         if ( !( hash7[ lane ] & mask ) )
-         {
-            extr_lane_4x32( lane_hash, hash, lane, 256 );
-            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
-         }
-         n += 4;
-      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
-      break;
-   }
-   *hashes_done = n - first_nonce + 1;
+      // 3. 32 byte hash from 2.
+      memcpy_128( block, hash32, 8 );
+      sha256_4way_transform( hash32, block, initstate );
+
+      // byte swap final hash for testing
+      mm128_block_bswap_32( hash32, hash32 );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
    return 0;
}

View File

@@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
   gate->scanhash = (void*)&scanhash_sha256t_16way;
-  gate->hash     = (void*)&sha256t_16way_hash;
#elif defined(__SHA__)
   gate->optimizations = SHA_OPT;
   gate->scanhash = (void*)&scanhash_sha256t;
-  gate->hash     = (void*)&sha256t_hash;
#elif defined(SHA256T_8WAY)
   gate->scanhash = (void*)&scanhash_sha256t_8way;
-  gate->hash     = (void*)&sha256t_8way_hash;
#else
   gate->scanhash = (void*)&scanhash_sha256t_4way;
-  gate->hash     = (void*)&sha256t_4way_hash;
#endif
   return true;
}
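The SHA256T_16WAY / SHA256T_8WAY / SHA256T_4WAY symbols are mutually exclusive compile-time width selections; the registration above simply picks the widest scanhash the build supports, and the separate hash callbacks are gone because each scanhash now hashes inline. A plausible sketch of how the width macros could be chosen (the exact feature tests live in the gate header and are an assumption here, not shown in this diff):

// Hypothetical width selection; the real macro logic is in sha256t-gate.h.
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA256T_16WAY 1
#elif defined(__AVX2__)
  #define SHA256T_8WAY 1
#elif defined(__SSE2__)
  #define SHA256T_4WAY 1
#endif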

View File

@@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_16WAY)

-void sha256t_16way_hash( void *output, const void *input );
int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_16way_hash( void *output, const void *input );
@@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce,
#if defined(SHA256T_8WAY)

-void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
@@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
#if defined(SHA256T_4WAY)

-void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_4way_hash( void *output, const void *input );
@@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
#endif

+#if defined(__SHA__)
+
+int sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
+#endif

int sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
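Every scanhash_* declared here follows the same contract: consume nonces starting at pdata[19], submit valid shares as they are found, leave the next nonce to try in pdata[19], report progress through *hashes_done, and return 0. A bare skeleton of that contract with the miner types reduced to essentials (the callback type is an abstraction for this sketch):

#include <stdint.h>
#include <stdbool.h>

// Abstracted scanhash contract; 'restart' models work_restart polling.
typedef bool (*restart_fn)(void);

int scanhash_skeleton( uint32_t data[20], uint32_t max_nonce,
                       uint64_t *hashes_done, restart_fn restart )
{
   const uint32_t first_nonce = data[19];
   uint32_t n = first_nonce;

   do
   {
      // hash the 80-byte header with nonce n here;
      // on a target hit: record n in data[19] and submit the share
      n++;
   } while ( n < max_nonce && !restart() );

   data[19] = n;                      // resume point for the next call
   *hashes_done = n - first_nonce;    // credited to hashrate stats
   return 0;
}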

View File

@@ -3,10 +3,14 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
-#include "algo/sha/sph_sha2.h"
+//#include "algo/sha/sph_sha2.h"
+#include "sha256-hash-opt.h"
+
+#if defined(__SHA__)

// Only used on CPUs with SHA

+/*
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));

void sha256t_midstate( const void* input )
@@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input )
   return 1;
}
+*/

+/*
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
-   uint32_t edata[20] __attribute__((aligned(64)));
-   uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t block[16] __attribute__ ((aligned (64)));
+   uint32_t hash32[8] __attribute__ ((aligned (32)));
+   uint32_t initstate[8] __attribute__ ((aligned (32)));
+   uint32_t midstate[8] __attribute__ ((aligned (32)));
+//   uint32_t edata[20] __attribute__((aligned(64)));
+//   uint32_t hash[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
+   __m128i shuf_bswap32 =
+            _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

-   mm128_bswap32_80( edata, pdata );
-   sha256t_midstate( edata );
+//   mm128_bswap32_80( edata, pdata );
+//   sha256t_midstate( edata );

+   // initialize state
+   initstate[0] = 0x6A09E667;
+   initstate[1] = 0xBB67AE85;
+   initstate[2] = 0x3C6EF372;
+   initstate[3] = 0xA54FF53A;
+   initstate[4] = 0x510E527F;
+   initstate[5] = 0x9B05688C;
+   initstate[6] = 0x1F83D9AB;
+   initstate[7] = 0x5BE0CD19;
+
+   // hash first 64 bytes of data
+   sha256_opt_transform( midstate, pdata, initstate );
+
   do
   {
-      edata[19] = n;
-      if ( likely( sha256t_hash( hash, edata ) ) )
-      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( n );
-         submit_solution( work, hash, mythr );
-      }
+      // 1. final 16 bytes of data, with padding
+      memcpy( block, pdata + 16, 16 );
+      block[ 4] = 0x80000000;
+      memset( block + 5, 0, 40 );
+      block[15] = 80*8; // bit count
+      sha256_opt_transform( hash32, block, midstate );
+
+      // 2. 32 byte hash from 1.
+      memcpy( block, hash32, 32 );
+      block[ 8] = 0x80000000;
+      memset( block + 9, 0, 24 );
+      block[15] = 32*8; // bit count
+      sha256_opt_transform( hash32, block, initstate );
+
+      // 3. 32 byte hash from 2.
+      memcpy( block, hash32, 32 );
+      sha256_opt_transform( hash32, block, initstate );
+
+      // byte swap final hash for testing
+      casti_m128i( hash32, 0 ) =
+            _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
+      casti_m128i( hash32, 1 ) =
+            _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
+
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
+         submit_solution( work, hash32, mythr );
      n++;
-   } while ( n < last_nonce && !work_restart[thr_id].restart );
+      pdata[19] = n;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
+   return 0;
+}
+*/
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}

+#endif
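sha256_ni2way_transform() exists because the SHA-NI round instructions form a serial dependency chain: each _mm_sha256rnds2_epu32() needs the previous result, so a single message stream leaves the unit mostly idle. Feeding two independent streams (nonce n and n+1) through one loop overlaps their latencies. The same idea in miniature, with a trivial stand-in for the round function:

#include <stdint.h>

// Two independent dependency chains in one loop. Each 'mix' result feeds
// the next call in its own chain, so one chain alone serializes; the
// interleaved second chain can issue while the first is still in flight.
static inline uint32_t mix( uint32_t x )
{
   return ( x ^ ( x >> 5 ) ) * 0x9E3779B9u;
}

void two_way( uint32_t *h0, uint32_t *h1, int rounds )
{
   uint32_t a = *h0, b = *h1;
   for ( int i = 0; i < rounds; i++ )
   {
      a = mix( a );   // chain for nonce n
      b = mix( b );   // chain for nonce n+1, independent of 'a'
   }
   *h0 = a;  *h1 = b;
}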

View File

@@ -96,74 +96,22 @@ static const uint64_t K512[80] =
// SHA-512 8 way 64 bit

#define CH8W(X, Y, Z) \
-   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
+   _mm512_ternarylogic_epi64( X, Y, Z, 0xca )

#define MAJ8W(X, Y, Z) \
-   _mm512_or_si512( _mm512_and_si512( X, Y ), \
-                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+   _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )

#define BSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
+   mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )

#define BSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
+   mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )

#define SSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
+   mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )

#define SSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
+   mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
-static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
-{
-   __m512i w0a, w1a, w0b, w1b;
-   w0a = mm512_ror_64( w0, 1 );
-   w1a = mm512_ror_64( w1,19 );
-   w0b = mm512_ror_64( w0, 8 );
-   w1b = mm512_ror_64( w1,61 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   w0b = _mm512_srli_epi64( w0, 7 );
-   w1b = _mm512_srli_epi64( w1, 6 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   return _mm512_add_epi64( w0a, w1a );
-}
-
-#define SSG8W_512x2_0( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-15], 1 ); \
-   X1a = mm512_ror_64( W[i-14], 1 ); \
-   X0b = mm512_ror_64( W[i-15], 8 ); \
-   X1b = mm512_ror_64( W[i-14], 8 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-15], 7 ); \
-   X1b = _mm512_srli_epi64( W[i-14], 7 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SSG8W_512x2_1( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-2],19 ); \
-   X1a = mm512_ror_64( W[i-1],19 ); \
-   X0b = mm512_ror_64( W[i-2],61 ); \
-   X1b = mm512_ror_64( W[i-1],61 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-2], 6 ); \
-   X1b = _mm512_srli_epi64( W[i-1], 6 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)

#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -187,8 +135,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
   mm512_block_bswap_64( W+8, in+8 );

   for ( i = 16; i < 80; i++ )
-      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
-                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+      W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ),
+                            W[ i- 7 ], W[ i-16 ] );

   if ( ctx->initialized )
   {
@@ -319,13 +267,19 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit

#define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

+/*
#define MAJ(X, Y, Z) \
   _mm256_or_si256( _mm256_and_si256( X, Y ), \
                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+*/
+
+#define MAJ(X, Y, Z) \
+  _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
+                                         Y_xor_Z ) )

#define BSG5_0(x) \
   mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
@@ -334,7 +288,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define BSG5_1(x) \
   mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
      _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )

/*
#define BSG5_0(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -402,7 +356,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
   w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/

+/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   __m256i K = _mm256_set1_epi64x( K512[ i ] ); \
@@ -431,7 +385,7 @@ do { \
   H = _mm256_add_epi64( T1, T2 ); \
   D = _mm256_add_epi64( D, T1 ); \
} while (0)
+*/

/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -445,7 +399,7 @@ do { \
} while (0)
*/

-/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   __m256i T1, T2; \
@@ -453,16 +407,17 @@ do { \
   T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
                          K, W[i] ) ); \
   T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
+  Y_xor_Z = X_xor_Y; \
   D = _mm256_add_epi64( D, T1 ); \
   H = _mm256_add_epi64( T1, T2 ); \
} while (0)
-*/

static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
   int i;
-   register __m256i A, B, C, D, E, F, G, H;
+   register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
   __m256i W[80];

   mm256_block_bswap_64( W , in );
@@ -495,6 +450,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
      H = m256_const1_64( 0x5BE0CD19137E2179 );
   }

+   Y_xor_Z = _mm256_xor_si256( B, C );
+
   for ( i = 0; i < 80; i += 8 )
   {
      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
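The rewritten MAJ relies on the identity MAJ(X, Y, Z) = Y ^ ((X ^ Y) & (Y ^ Z)). Because the working variables rotate one position per round, the X ^ Y computed this round is exactly the Y ^ Z the next round needs, which is what the added Y_xor_Z = X_xor_Y forwarding lines exploit. An exhaustive check of the identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   // All 8 input combinations, expanded to full-width masks.
   for ( unsigned v = 0; v < 8; v++ )
   {
      uint32_t x = -( ( v >> 2 ) & 1 );
      uint32_t y = -( ( v >> 1 ) & 1 );
      uint32_t z = -( v & 1 );
      uint32_t maj_ref = ( x & y ) | ( ( x | y ) & z );   // original form
      uint32_t maj_new = y ^ ( ( x ^ y ) & ( y ^ z ) );   // rewritten form
      assert( maj_ref == maj_new );
   }
   return 0;
}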

View File

@@ -40,8 +40,8 @@
#endif

#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+//#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )

#define ROTR    SPH_ROTR32

#define BSG2_0(x)      (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
@@ -73,7 +73,194 @@ static const sph_u32 H256[8] = {
#if defined(__SHA__)

-#include "sha256-hash-opt.c"
+#include "simd-utils.h"
static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
}
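sha2_round() compresses exactly one 64-byte block into the 8-word state, doing its own little-endian to big-endian shuffle via MASK; the caller supplies standard SHA-256 chaining and padding. A sketch of that outer loop for single-shot hashing, assuming the state starts from the standard IV (H256 above):

#include <stdint.h>
#include <string.h>

// Hash a whole message with sha2_round(), one 64-byte block per call.
// Padding: append 0x80, zero-fill, then the 64-bit big-endian bit count
// in the last 8 bytes of the final block.
static void sha256_oneshot_sketch( uint32_t state[8],
                                   const uint8_t *msg, size_t len )
{
   uint8_t last[128] = {0};
   size_t i = 0;

   for ( ; i + 64 <= len; i += 64 )
      sha2_round( msg + i, state );          // full blocks

   size_t rem = len - i;
   memcpy( last, msg + i, rem );
   last[rem] = 0x80;                         // append the 1 bit
   size_t fin = ( rem < 56 ) ? 64 : 128;     // room left for the length?
   uint64_t bits = (uint64_t)len * 8;
   for ( int b = 0; b < 8; b++ )             // big-endian bit count
      last[fin - 1 - b] = (uint8_t)( bits >> ( 8 * b ) );
   sha2_round( last, state );
   if ( fin == 128 )
      sha2_round( last + 64, state );
}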
#else  // no SHA
@@ -132,6 +319,7 @@ static const sph_u32 K[64] = {
    t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
        + K[pcount + (pc)] + W[(pc) & 0x0F]); \
    t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
+    Y_xor_Z = X_xor_Y; \
    d = SPH_T32(d + t1); \
    h = SPH_T32(t1 + t2); \
  } while (0)
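In this scalar step macro the forwarding costs nothing extra: Y_xor_Z is primed from B ^ C once per block, and each step's MAJ leaves X_xor_Y behind for the rotated successor (the A,B,C then H,A,B then G,H,A call pattern visible below), so only one XOR per round is spent on MAJ's inputs. Compacted to two rounds, as a self-contained sketch:

#include <stdint.h>

// MAJ with forwarding: writes x^y through *x_xor_y for the next round.
static uint32_t maj_fwd( uint32_t x, uint32_t y, uint32_t z,
                         uint32_t *x_xor_y, uint32_t y_xor_z )
{
   *x_xor_y = x ^ y;
   return y ^ ( *x_xor_y & y_xor_z );
}

uint32_t two_rounds_maj( uint32_t a, uint32_t b, uint32_t c, uint32_t h )
{
   uint32_t x_xor_y, y_xor_z = b ^ c;                    // primed once
   uint32_t m0 = maj_fwd( a, b, c, &x_xor_y, y_xor_z );  // round i: MAJ(A,B,C)
   y_xor_z = x_xor_y;                                    // forward a^b
   uint32_t m1 = maj_fwd( h, a, b, &x_xor_y, y_xor_z );  // round i+1: MAJ(H,A,B)
   return m0 ^ m1;
}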
@@ -142,7 +330,7 @@ static const sph_u32 K[64] = {
    SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)

#define SHA2_ROUND_BODY(in, r)   do { \
-    sph_u32 A, B, C, D, E, F, G, H; \
+    sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
    sph_u32 W[16]; \
    unsigned pcount; \
 \
@@ -155,6 +343,7 @@ static const sph_u32 K[64] = {
    G = (r)[6]; \
    H = (r)[7]; \
    pcount = 0; \
+    Y_xor_Z = B ^ C; \
    SHA2_STEP1(A, B, C, D, E, F, G, H, in,  0); \
    SHA2_STEP1(H, A, B, C, D, E, F, G, in,  1); \
    SHA2_STEP1(G, H, A, B, C, D, E, F, in,  2); \
@@ -202,7 +391,7 @@ static const sph_u32 K[64] = {
#else  // large footprint (default)

#define SHA2_ROUND_BODY(in, r)   do { \
-    sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
+    sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z; \
    sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
    sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
 \
@@ -214,388 +403,453 @@ static const sph_u32 K[64] = {
    F = (r)[5]; \
    G = (r)[6]; \
    H = (r)[7]; \
+    Y_xor_Z = B ^ C; \
    W00 = in(0); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x428A2F98) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = in(1); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x71374491) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = in(2); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xB5C0FBCF) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = in(3); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xE9B5DBA5) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = in(4); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x3956C25B) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = in(5); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x59F111F1) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = in(6); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x923F82A4) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = in(7); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0xAB1C5ED5) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = in(8); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xD807AA98) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = in(9); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x12835B01) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = in(10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x243185BE) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = in(11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x550C7DC3) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = in(12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x72BE5D74) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = in(13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x80DEB1FE) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = in(14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x9BDC06A7) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = in(15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0xC19BF174) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xE49B69C1) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xEFBE4786) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x0FC19DC6) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x240CA1CC) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x2DE92C6F) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x4A7484AA) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x5CB0A9DC) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x76F988DA) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x983E5152) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xA831C66D) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xB00327C8) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xBF597FC7) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0xC6E00BF3) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0xD5A79147) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x06CA6351) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x14292967) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x27B70A85) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x2E1B2138) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x4D2C6DFC) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x53380D13) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x650A7354) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x766A0ABB) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x81C2C92E) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x92722C85) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xA2BFE8A1) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xA81A664B) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xC24B8B70) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xC76C51A3) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0xD192E819) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0xD6990624) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0xF40E3585) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x106AA070) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x19A4C116) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x1E376C08) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x2748774C) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x34B0BCB5) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x391C0CB3) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x4ED8AA4A) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x5B9CCA4F) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x682E6FF3) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x748F82EE) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x78A5636F) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x84C87814) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x8CC70208) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \ A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \ E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x90BEFFFA) + W12); \ + SPH_C32(0x90BEFFFA) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \ H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \ D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xA4506CEB) + W13); \ + SPH_C32(0xA4506CEB) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \ G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \ C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xBEF9A3F7) + W14); \ + SPH_C32(0xBEF9A3F7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \ F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \ B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC67178F2) + W15); \ + SPH_C32(0xC67178F2) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \ E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \ A = SPH_T32(T1 + T2); \
(r)[0] = SPH_T32((r)[0] + A); \ (r)[0] = SPH_T32((r)[0] + A); \
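The Y_xor_Z assignments interleaved into the SHA-256 rounds above cache an intermediate of the rewritten MAJ: with MAJ(X, Y, Z) expressed as Y ^ ((X ^ Y) & (Y ^ Z)), the working variables rotate one position per round, so the next round's Y ^ Z is exactly this round's X ^ Y and each MAJ pays for only one fresh XOR. A minimal scalar sketch of the idea (maj_cached is a hypothetical name, not a macro from this file):

#include <stdint.h>

/* One SHA-256 MAJ per round, reusing the previous round's X ^ Y
   as the current round's Y ^ Z. */
static uint32_t maj_cached( uint32_t x, uint32_t y, uint32_t *y_xor_z )
{
   uint32_t x_xor_y = x ^ y;
   uint32_t maj = y ^ ( x_xor_y & *y_xor_z );
   *y_xor_z = x_xor_y;   /* becomes Y ^ Z after the variables rotate */
   return maj;
}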

View File

@@ -38,7 +38,8 @@
#if SPH_64 #if SPH_64
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )
#define ROTR64 SPH_ROTR64 #define ROTR64 SPH_ROTR64
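The single-expression MAJ above replaces the canonical (X & Y) | ((X | Y) & Z). The two forms agree bit-for-bit, which a throwaway exhaustive check confirms (standalone sketch, not part of the build):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   for ( uint64_t x = 0; x < 2; x++ )
   for ( uint64_t y = 0; y < 2; y++ )
   for ( uint64_t z = 0; z < 2; z++ )
      assert( ( (x & y) | ( (x | y) & z ) ) ==
              ( y ^ ( (x ^ y) & (y ^ z) ) ) );
   return 0;
}

Since all the operations are bitwise, the 1-bit check extends to full 64-bit words.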

View File

@@ -310,12 +310,13 @@ do { \
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \ do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \ _mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \ _mm256_set1_epi32(5UL) ) ), \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ _mm256_set1_epi32(3UL) ) ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0) } while (0)
#define PERM_STEP_0_8 do { \ #define PERM_STEP_0_8 do { \

View File

@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
register __m512i K0, K1, K2, K3, K4, K5, K6, K7; register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
__m512i *M = (__m512i*)msg; __m512i *M = (__m512i*)msg;
__m512i *H = (__m512i*)ctx->h; __m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r; int r;
P0 = H[0]; P0 = H[0];
@@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
_mm512_aesenc_epi128( K0, m512_zero ) ) ); _mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 ) if ( r == 0 )
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( K0 = _mm512_xor_si512( K0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0, K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 ) if ( r == 1 )
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( K1 = _mm512_xor_si512( K1, mm512_ror128_32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1, K2 = _mm512_xor_si512( K1,
@@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 ) if ( r == 2 )
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( K7 = _mm512_xor_si512( K7, mm512_swap128_64(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X ); P1 = _mm512_xor_si512( P1, X );
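The rewritten key whitening above replaces the per-round _mm512_set4_epi32 constructions with one precomputed count vector plus a masked XOR. A sketch of the r == 0 case, assuming AVX-512F (invert_elem3 is an illustrative name, not from this file):

#include <immintrin.h>

/* Mask 0x8888 selects 32-bit elements 3, 7, 11 and 15, i.e. the
   count3 slot of each 128-bit lane; XOR with all-ones complements
   only those elements, leaving count0..count2 untouched. */
static inline __m512i invert_elem3( const __m512i count )
{
   const __m512i neg1 = _mm512_set1_epi32( -1 );
   return _mm512_mask_xor_epi32( count, 0x8888, count, neg1 );
}

The r == 1 and r == 2 cases reuse the same masked XOR, with a lane rotate or swap applied afterwards instead of building a fresh vector.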

View File

@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round // round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
k00 = m[0]; k00 = m[0];
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero ); x = _mm_aesenc_si128( x, zero );

View File

@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
// c1_16(185), c1_16(233) };
S0l = _mm512_xor_si512( S[0], M[0] ); S0l = _mm512_xor_si512( S[0], M[0] );
S0h = _mm512_xor_si512( S[1], M[1] ); S0h = _mm512_xor_si512( S[1], M[1] );
S1l = _mm512_xor_si512( S[2], M[2] ); S1l = _mm512_xor_si512( S[2], M[2] );
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// targeted, local macros don't need a unique name // targeted, local macros don't need a unique name
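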
#define S(i) S##i #define S(i) S##i
#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
/*
#define F_0(B, C, D) \ #define F_0(B, C, D) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
#define F_1(B, C, D) \ #define F_1(B, C, D) \
_mm512_or_si512( _mm512_and_si512( D, C ),\ _mm512_or_si512( _mm512_and_si512( D, C ),\
_mm512_and_si512( _mm512_or_si512( D,C ), B ) ) _mm512_and_si512( _mm512_or_si512( D,C ), B ) )
*/
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

View File

@@ -6,10 +6,6 @@
#define PRINT_SOME 0 #define PRINT_SOME 0
/* JDD all ocurrances of macro X in this file renamed to XX
* due to name conflict
*/
int SupportedLength(int hashbitlen) { int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512) if (hashbitlen <= 0 || hashbitlen > 512)
return 0; return 0;

View File

@@ -309,19 +309,13 @@ static const uint64_t IV512[] = {
sc->bcount = bcount; \ sc->bcount = bcount; \
} while (0) } while (0)
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \ do { \
k8 = _mm512_xor_si512( _mm512_xor_si512( \ k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
_mm512_xor_si512( k2, k3 ) ), \
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
_mm512_xor_si512( k6, k7 ) ) ), \
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
t2 = t0 ^ t1; \ t2 = t0 ^ t1; \
} while (0) } while (0)
@@ -340,7 +334,6 @@ do { \
m512_const1_64( s ) ) ); \ m512_const1_64( s ) ) ); \
} while (0) } while (0)
#define TFBIG_MIX_8WAY(x0, x1, rc) \ #define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \ do { \
x0 = _mm512_add_epi64( x0, x1 ); \ x0 = _mm512_add_epi64( x0, x1 ); \

View File

@@ -44,8 +44,8 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
if ( opt_data_file || !opt_verify ) if ( opt_data_file || !opt_verify )
{ {
if ( opt_data_file ) if ( opt_data_file )
applog( LOG_ERR, applog( LOG_ERR, "Verthash data file not found or invalid: %s",
"Verthash data file not found or invalid: %s", info->fileName ); info->fileName );
else else
{ {
applog( LOG_ERR, applog( LOG_ERR,
@@ -134,76 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
return (a ^ b) * 0x1000193; return (a ^ b) * 0x1000193;
} }
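fnv1a above is a single FNV-1a mixing step: XOR the next word in, then multiply by the 32-bit FNV prime. Verthash chains it through an accumulator seeded with the FNV offset basis, folding in one 32-bit word per call. A standalone sketch of the chaining (illustrative input values):

#include <stdint.h>
#include <stdio.h>

static uint32_t fnv1a_step( uint32_t a, uint32_t b )
{
   return ( a ^ b ) * 0x1000193u;       /* 32-bit FNV prime */
}

int main(void)
{
   uint32_t acc = 0x811c9dc5u;          /* 32-bit FNV offset basis */
   const uint32_t words[2] = { 0xdeadbeefu, 0x00000001u };
   for ( int i = 0; i < 2; i++ )
      acc = fnv1a_step( acc, words[i] );
   printf( "%08x\n", acc );
   return 0;
}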
void verthash_hash( const unsigned char* blob_bytes, #if 0
const size_t blob_size, static void rotate_indexes( uint32_t *p )
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE] )
{ {
unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64)));
unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64)));
uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64)));
uint32_t* p0_index = (uint32_t*)p0;
verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] );
for ( size_t x = 0; x < VH_N_ROT; ++x )
{
memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)),
p0, VH_N_SUBSET);
#if defined(__AVX2__) #if defined(__AVX2__)
for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8) for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{ {
casti_m256i( p0_index, y ) = mm256_rol_32( __m256i *px = (__m256i*)p + x;
casti_m256i( p0_index, y ), 1 );
casti_m256i( p0_index, y+1 ) = mm256_rol_32( px[0] = mm256_rol_32( px[0], 1 );
casti_m256i( p0_index, y+1 ), 1 ); px[1] = mm256_rol_32( px[1], 1 );
casti_m256i( p0_index, y+2 ) = mm256_rol_32( px[2] = mm256_rol_32( px[2], 1 );
casti_m256i( p0_index, y+2 ), 1 ); px[3] = mm256_rol_32( px[3], 1 );
casti_m256i( p0_index, y+3 ) = mm256_rol_32( px[4] = mm256_rol_32( px[4], 1 );
casti_m256i( p0_index, y+3 ), 1 ); px[5] = mm256_rol_32( px[5], 1 );
casti_m256i( p0_index, y+4 ) = mm256_rol_32( px[6] = mm256_rol_32( px[6], 1 );
casti_m256i( p0_index, y+4 ), 1 ); px[7] = mm256_rol_32( px[7], 1 );
casti_m256i( p0_index, y+5 ) = mm256_rol_32( }
casti_m256i( p0_index, y+5 ), 1 );
casti_m256i( p0_index, y+6 ) = mm256_rol_32(
casti_m256i( p0_index, y+6 ), 1 );
casti_m256i( p0_index, y+7 ) = mm256_rol_32(
casti_m256i( p0_index, y+7 ), 1 );
}
#else #else
for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8) for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{ {
casti_m128i( p0_index, y ) = mm128_rol_32( __m128i *px = (__m128i*)p0_index + x;
casti_m128i( p0_index, y ), 1 );
casti_m128i( p0_index, y+1 ) = mm128_rol_32( px[0] = mm128_rol_32( px[0], 1 );
casti_m128i( p0_index, y+1 ), 1 ); px[1] = mm128_rol_32( px[1], 1 );
casti_m128i( p0_index, y+2 ) = mm128_rol_32( px[2] = mm128_rol_32( px[2], 1 );
casti_m128i( p0_index, y+2 ), 1 ); px[3] = mm128_rol_32( px[3], 1 );
casti_m128i( p0_index, y+3 ) = mm128_rol_32( px[4] = mm128_rol_32( px[4], 1 );
casti_m128i( p0_index, y+3 ), 1 ); px[5] = mm128_rol_32( px[5], 1 );
casti_m128i( p0_index, y+4 ) = mm128_rol_32( px[6] = mm128_rol_32( px[6], 1 );
casti_m128i( p0_index, y+4 ), 1 ); px[7] = mm128_rol_32( px[7], 1 );
casti_m128i( p0_index, y+5 ) = mm128_rol_32( }
casti_m128i( p0_index, y+5 ), 1 );
casti_m128i( p0_index, y+6 ) = mm128_rol_32( #endif
casti_m128i( p0_index, y+6 ), 1 ); /*
casti_m128i( p0_index, y+7 ) = mm128_rol_32( for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
casti_m128i( p0_index, y+7 ), 1 ); p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
} */
}
#endif
static inline uint32_t rotl32( uint32_t a, size_t r )
{
return ( a << r ) | ( a >> (32-r) );
}
// Vectorized and targetted version of fnv1a
#if defined (__AVX2__)
#define MULXOR \
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE41__)
#define MULXOR \
casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \
casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k );
#else
#define MULXOR \
for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
hash[j] = fnv1a( hash[j], blob_off[j] ); \
#endif #endif
} #define UPDATE_ACCUMULATOR \
accumulator = fnv1a( accumulator, blob_off[0] ); \
accumulator = fnv1a( accumulator, blob_off[1] ); \
accumulator = fnv1a( accumulator, blob_off[2] ); \
accumulator = fnv1a( accumulator, blob_off[3] ); \
accumulator = fnv1a( accumulator, blob_off[4] ); \
accumulator = fnv1a( accumulator, blob_off[5] ); \
accumulator = fnv1a( accumulator, blob_off[6] ); \
accumulator = fnv1a( accumulator, blob_off[7] )
sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE );
uint32_t* p1_32 = (uint32_t*)p1; // first pass no rotate
uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; #define ROUND_0 \
uint32_t value_accumulator = 0x811c9dc5; for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( subset[i], accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}
// subsequent passes rotate by r on demand, no need for mass rotate
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}
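ROUND_0 and ROUND_r drop the old scheme of copying the subset once per pass and mass-rotating every word by one bit; instead each looked-up index is rotated by r at the point of use. The two are equivalent because r successive 1-bit rotations equal a single r-bit rotation, which a quick standalone check confirms (sketch only):

#include <assert.h>
#include <stdint.h>

static uint32_t rotl32_ref( uint32_t a, unsigned r )
{
   return ( a << r ) | ( a >> ( 32 - r ) );   /* valid for r in 1..31 */
}

int main(void)
{
   uint32_t x = 0x80000001u, mass = x;
   for ( unsigned r = 1; r < 8; r++ )         /* a few passes */
   {
      mass = rotl32_ref( mass, 1 );           /* old: rotate stored copy */
      assert( mass == rotl32_ref( x, r ) );   /* new: rotate on lookup */
   }
   return 0;
}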
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
/ VH_BYTE_ALIGNMENT ) + 1; / VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -212,39 +253,14 @@ void verthash_hash( const unsigned char* blob_bytes,
const __m128i k = _mm_set1_epi32( 0x1000193 ); const __m128i k = _mm_set1_epi32( 0x1000193 );
#endif #endif
for ( size_t i = 0; i < VH_N_INDEXES; i++ ) sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );
{ verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );
const uint32_t offset =
( fnv1a( seek_indexes[i], value_accumulator) % mdiv )
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) );
const uint32_t *blob_off = blob_bytes_32 + offset;
// update value accumulator for next seek index ROUND_0;
value_accumulator = fnv1a( value_accumulator, blob_off[0] ); for ( size_t r = 1; r < VH_N_ROT; ++r )
value_accumulator = fnv1a( value_accumulator, blob_off[1] ); ROUND_r( r );
value_accumulator = fnv1a( value_accumulator, blob_off[2] );
value_accumulator = fnv1a( value_accumulator, blob_off[3] );
value_accumulator = fnv1a( value_accumulator, blob_off[4] );
value_accumulator = fnv1a( value_accumulator, blob_off[5] );
value_accumulator = fnv1a( value_accumulator, blob_off[6] );
value_accumulator = fnv1a( value_accumulator, blob_off[7] );
#if defined (__AVX2__) memcpy( output, hash, VH_HASH_OUT_SIZE );
*(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256(
*(__m256i*)p1_32, *(__m256i*)blob_off ), k );
#elif defined(__SSE41__)
casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k );
casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k );
#else
for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ )
p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] );
#endif
}
memcpy( output, p1, VH_HASH_OUT_SIZE );
} }
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------

View File

@@ -47,10 +47,8 @@ void verthash_info_free(verthash_info_t* info);
//! Generate verthash data file and save it to specified location. //! Generate verthash data file and save it to specified location.
int verthash_generate_data_file(const char* output_file_name); int verthash_generate_data_file(const char* output_file_name);
void verthash_hash(const unsigned char* blob_bytes, void verthash_hash( const void *blob_bytes, const size_t blob_size,
const size_t blob_size, const void *input, void *output );
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE]);
void verthash_sha3_512_prehash_72( const void *input ); void verthash_sha3_512_prehash_72( const void *input );
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );

View File

@@ -62,7 +62,7 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
__m256i vhashB[ 10 ] __attribute__ ((aligned (64))); __m256i vhashB[ 10 ] __attribute__ ((aligned (64)));
sha3_4way_ctx_t ctx; sha3_4way_ctx_t ctx;
__m256i vnonce = _mm256_set1_epi64x( nonce ); const __m256i vnonce = _mm256_set1_epi64x( nonce );
memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx );
sha3_4way_update( &ctx, &vnonce, 8 ); sha3_4way_update( &ctx, &vnonce, 8 );
@@ -88,14 +88,13 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
#endif #endif
} }
int scanhash_verthash( struct work *work, uint32_t max_nonce, int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t edata[20] __attribute__((aligned(64))); uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64))); uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1; const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce; uint32_t n = first_nonce;
@@ -109,8 +108,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
{ {
edata[19] = n; edata[19] = n;
verthash_hash( verthashInfo.data, verthashInfo.dataSize, verthash_hash( verthashInfo.data, verthashInfo.dataSize,
(const unsigned char (*)[80]) edata, edata, hash );
(unsigned char (*)[32]) hash );
if ( valid_hash( hash, ptarget ) && !bench ) if ( valid_hash( hash, ptarget ) && !bench )
{ {
pdata[19] = bswap_32( n ); pdata[19] = bswap_32( n );
@@ -123,17 +121,16 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
const char *default_verthash_data_file = "verthash.dat"; static const char *default_verthash_data_file = "verthash.dat";
bool register_verthash_algo( algo_gate_t* gate ) bool register_verthash_algo( algo_gate_t* gate )
{ {
opt_target_factor = 256.0; opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash; gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT; gate->optimizations = AVX2_OPT;
char *verthash_data_file = opt_data_file ? opt_data_file const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file; : default_verthash_data_file;
int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file );
if (vhLoadResult == 0) // No Error if (vhLoadResult == 0) // No Error
@@ -160,7 +157,8 @@ bool register_verthash_algo( algo_gate_t* gate )
// Handle Verthash error codes // Handle Verthash error codes
if ( vhLoadResult == 1 ) if ( vhLoadResult == 1 )
{ {
applog( LOG_ERR, "Verthash data file not found: %s", verthash_data_file ); applog( LOG_ERR, "Verthash data file not found: %s",
verthash_data_file );
if ( !opt_data_file ) if ( !opt_data_file )
applog( LOG_NOTICE, "Add '--verify' to create verthash.dat"); applog( LOG_NOTICE, "Add '--verify' to create verthash.dat");
} }

View File

@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define INTEGERIFY (uint32_t)X.d[0] #define INTEGERIFY (uint32_t)X.d[0]
#endif #endif
// AVX512 ternary logic optimization
#if defined(__AVX512VL__)
#define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)
#else
#define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \
XOR_X( in2 )
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )
#endif
/** /**
* Apply the Salsa20 core to the block provided in X ^ in. * Apply the Salsa20 core to the block provided in X ^ in.
*/ */
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
{ {
DECL_X DECL_X
XOR_X_2(Bin1[1], Bin2[1]) XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
XOR_X(Bin1[0]) // XOR_X_2(Bin1[1], Bin2[1])
// XOR_X(Bin1[0])
SALSA20_XOR_MEM(Bin2[0], Bout[0]) SALSA20_XOR_MEM(Bin2[0], Bout[0])
XOR_X(Bin1[1])
SALSA20_XOR_MEM(Bin2[1], Bout[1]) // Factor out the XOR from salsa20 to do a xor3
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
return INTEGERIFY; return INTEGERIFY;
} }
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
i = 0; i = 0;
r--; r--;
do { do {
XOR_X(Bin1[i]) XOR_X_XOR_X( Bin1[i], Bin2[i] )
XOR_X(Bin2[i]) // XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM PWXFORM
WRITE_X(Bout[i]) WRITE_X(Bout[i])
XOR_X(Bin1[i + 1]) XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
XOR_X(Bin2[i + 1]) // XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM PWXFORM
if (unlikely(i >= r)) if (unlikely(i >= r))

View File

@@ -35,7 +35,6 @@
#include "miner.h" #include "miner.h"
#include "simd-utils.h" #include "simd-utils.h"
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {

View File

@@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx
# Westmere SSE4.2 AES # Westmere SSE4.2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe mv cpuminer.exe cpuminer-aes-sse42.exe

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.16.2' PACKAGE_VERSION='3.17.1'
PACKAGE_STRING='cpuminer-opt 3.16.2' PACKAGE_STRING='cpuminer-opt 3.17.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.16.2 cpuminer-opt configure 3.17.1
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.16.2, which was It was created by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.16.2' VERSION='3.17.1'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.16.2, which was This file was extended by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.16.2 cpuminer-opt config.status 3.17.1
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.16.2]) AC_INIT([cpuminer-opt], [3.17.1])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -447,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work )
if ( !allow_mininginfo ) if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work ); net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );
work->targetdiff = hash_to_diff( work->target ); work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff; stratum_diff = last_targetdiff = work->targetdiff;
work->sharediff = 0; work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks ); algo_gate.decode_extra_data( work, &net_blocks );
@@ -482,13 +484,17 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980 // "networkhashps": 56475980
if ( res ) if ( res )
{ {
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it; define a local to override
// the global.
double net_diff = 0.;
json_t *key = json_object_get( res, "difficulty" ); json_t *key = json_object_get( res, "difficulty" );
if ( key ) if ( key )
{ {
if ( json_is_object( key ) ) if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" ); key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) ) if ( json_is_real( key ) )
net_diff = work->targetdiff = json_real_value( key ); net_diff = json_real_value( key );
} }
key = json_object_get( res, "networkhashps" ); key = json_object_get( res, "networkhashps" );
@@ -908,6 +914,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
} }
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
work->target[7 - i] = be32dec( target + i ); work->target[7 - i] = be32dec( target + i );
net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" ); tmp = json_object_get( val, "workid" );
if ( tmp ) if ( tmp )
@@ -1047,6 +1054,8 @@ void report_summary_log( bool force )
applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz",
tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 );
if ( curr_temp > hi_temp ) hi_temp = curr_temp; if ( curr_temp > hi_temp ) hi_temp = curr_temp;
if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) )
restart_threads();
prev_temp = curr_temp; prev_temp = curr_temp;
} }
} }
@@ -1145,7 +1154,7 @@ void report_summary_log( bool force )
if ( mismatch ) if ( mismatch )
{ {
if ( mismatch != 1 ) if ( mismatch != 1 )
applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", mismatch ); applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
else else
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
} }
@@ -1165,7 +1174,8 @@ static int share_result( int result, struct work *work,
char bres[48]; char bres[48];
bool solved = false; bool solved = false;
bool stale = false; bool stale = false;
char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL; char *acol, *bcol, *scol, *rcol;
acol = bcol = scol = rcol = "\0";
pthread_mutex_lock( &stats_lock ); pthread_mutex_lock( &stats_lock );
@@ -1207,7 +1217,7 @@ static int share_result( int result, struct work *work,
sprintf( sres, "S%d", stale_share_count ); sprintf( sres, "S%d", stale_share_count );
sprintf( rres, "R%d", rejected_share_count ); sprintf( rres, "R%d", rejected_share_count );
if unlikely( ( my_stats.net_diff > 0. ) if unlikely( ( my_stats.net_diff > 0. )
&& ( my_stats.share_diff >= net_diff ) ) && ( my_stats.share_diff >= my_stats.net_diff ) )
{ {
solved = true; solved = true;
solved_block_count++; solved_block_count++;
@@ -2085,10 +2095,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
sctx->block_height, net_diff, g_work->job_id ); sctx->block_height, net_diff, g_work->job_id );
else if ( !opt_quiet ) else if ( !opt_quiet )
{ {
unsigned char *xnonce2str = abin2hex( g_work->xnonce2, unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len ); g_work->xnonce2_len );
applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
xnonce2str, sctx->block_height, net_diff ); xnonce2str, sctx->block_height, g_work->job_id );
free( xnonce2str ); free( xnonce2str );
} }
@@ -2171,11 +2181,11 @@ static void *miner_thread( void *userdata )
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
* and if that fails, then SCHED_BATCH. No need for this to be an * and if that fails, then SCHED_BATCH. No need for this to be an
* error if it fails */ * error if it fails */
if (!opt_benchmark && opt_priority == 0) if ( !opt_priority )
{ {
setpriority(PRIO_PROCESS, 0, 19); setpriority(PRIO_PROCESS, 0, 19);
if ( !thr_id && !opt_quiet ) if ( !thr_id && opt_debug )
applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority ); applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority );
drop_policy(); drop_policy();
} }
else else
@@ -2192,9 +2202,12 @@ static void *miner_thread( void *userdata )
case 4: prio = -10; break; case 4: prio = -10; break;
case 5: prio = -15; case 5: prio = -15;
} }
if ( !( thr_id || opt_quiet ) ) if ( !thr_id )
applog( LOG_INFO, "Miner thread priority %d (nice %d)", {
applog( LOG_INFO, "User set miner thread priority %d (nice %d)",
opt_priority, prio ); opt_priority, prio );
applog( LOG_WARNING, "High priority mining threads may cause system instability");
}
#endif #endif
setpriority(PRIO_PROCESS, 0, prio); setpriority(PRIO_PROCESS, 0, prio);
if ( opt_priority == 0 ) if ( opt_priority == 0 )
@@ -2439,7 +2452,7 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
scale_hash_for_display( &hashrate, hr_units ); scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate ); sprintf( hr, "%.2f", hashrate );
#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else #else
float lo_freq = 0., hi_freq = 0.; float lo_freq = 0., hi_freq = 0.;
@@ -2739,10 +2752,10 @@ static void *stratum_thread(void *userdata )
stratum.url = strdup( rpc_url ); stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url); applog(LOG_BLUE, "Connection changed to %s", short_url);
} }
else // if ( !opt_quiet ) else
applog(LOG_WARNING, "Stratum connection reset"); applog(LOG_WARNING, "Stratum connection reset");
// reset stats queue as well // reset stats queue as well
s_get_ptr = s_put_ptr = 0; if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
} }
while ( !stratum.curl ) while ( !stratum.curl )
@@ -2789,13 +2802,15 @@ static void *stratum_thread(void *userdata )
else else
{ {
applog(LOG_WARNING, "Stratum connection interrupted"); applog(LOG_WARNING, "Stratum connection interrupted");
stratum_disconnect( &stratum ); // stratum_disconnect( &stratum );
stratum_need_reset = true;
} }
} }
else else
{ {
applog(LOG_ERR, "Stratum connection timeout"); applog(LOG_ERR, "Stratum connection timeout");
stratum_disconnect( &stratum ); stratum_need_reset = true;
// stratum_disconnect( &stratum );
} }
} // loop } // loop
@@ -2843,7 +2858,6 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42;
bool use_avx2; bool use_avx2;
bool use_avx512; bool use_avx512;
bool use_sha; bool use_sha;
@@ -2917,13 +2931,14 @@ static bool cpu_capability( bool display_only )
if ( algo_features == EMPTY_SET ) printf( " None" ); if ( algo_features == EMPTY_SET ) printf( " None" );
else else
{ {
if ( algo_has_avx512 ) printf( " AVX512" ); if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " ); else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" ); else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " ); else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" ); if ( algo_has_vaes ||
else if ( algo_has_aes ) printf( " AES" ); algo_has_vaes256 ) printf( " VAES" );
if ( algo_has_sha ) printf( " SHA" ); else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
} }
printf("\n"); printf("\n");
@@ -2959,13 +2974,12 @@ static bool cpu_capability( bool display_only )
// Determine mining options // Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2; use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes
&& ( use_avx512 || algo_has_vaes256 ); || algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
use_sha || use_vaes ); use_sha || use_vaes );
// Display best options // Display best options
@@ -2975,7 +2989,6 @@ static bool cpu_capability( bool display_only )
{ {
if ( use_avx512 ) printf( " AVX512" ); if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" ); else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" ); if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" ); else if ( use_aes ) printf( " AES" );
@@ -3394,8 +3407,6 @@ void parse_arg(int key, char *arg )
v = atoi(arg); v = atoi(arg);
if (v < 0 || v > 5) /* sanity check */ if (v < 0 || v > 5) /* sanity check */
show_usage_and_exit(1); show_usage_and_exit(1);
// option is deprecated, show warning
applog( LOG_WARNING, "High priority mining threads may cause system instability");
opt_priority = v; opt_priority = v;
break; break;
case 'N': // N parameter for various scrypt algos case 'N': // N parameter for various scrypt algos

View File

@@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
extern void cbin2hex(char *out, const char *in, size_t len); extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len ); void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len ); char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf, bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen ); size_t buflen );

View File

@@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
// //
// Bit rotations // Bit rotations

View File

@@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_add4_8( a, b, c, d ) \ #define mm256_add4_8( a, b, c, d ) \
_mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) )
#if defined(__AVX512VL__)
// AVX512 has ternary logic that supports any 3 input boolean expression.
// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#define mm256_xor4( a, b, c, d ) \ #define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif
// //
// Bit rotations. // Bit rotations.
// //
@@ -200,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// //
// Rotate elements across all lanes. // Rotate elements across all lanes.
//
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element #if defined(__AVX512VL__)
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#if defined(__AVX512F__) && defined(__AVX512VL__) static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
static inline __m256i mm256_ror_1x32( const __m256i v ) static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); } { return _mm256_alignr_epi32( v, v, 1 ); }
@@ -218,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
#else // AVX2 #else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element. // Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \ #define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \ _mm256_permutevar8x32_epi32( v, \
@@ -229,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) 0x0000000200000001, 0x0000000000000007 )
#endif // AVX512 else AVX2 #endif // AVX512 else AVX2
// //

View File

@@ -61,7 +61,7 @@
// //
// Additionally, permutations using smaller vectors can be more efficient // Additionally, permutations using smaller vectors can be more efficient
// if the permutation doesn't cross lane boundaries, typically 128 bits, // if the permutation doesn't cross lane boundaries, typically 128 bits,
// and the smnaller vector can use an imm comtrol. // and the smaller vector can use an imm control.
// //
// If the permutation doesn't cross lane boundaries a shuffle instruction // If the permutation doesn't cross lane boundaries a shuffle instruction
// can be used with imm control instead of permute. // can be used with imm control instead of permute.
@@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i; return v.m512i;
} }
// Equivalent of set1, broadcast lo element all elements. // Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v ) static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); } { return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent // Basic operations without SIMD equivalent
// ~x // ~x
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) // #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
// -x // -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_add4_8( a, b, c, d ) \ #define mm512_add4_8( a, b, c, d ) \
_mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
// //
// Ternary logic uses an 8 bit truth table to define any 3 input logical
// operation using any combination of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ) ]
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
// Some 2 input operations that don't have their own instruction mnemonic.
// ~( a | b )
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), same as (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x3f )
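Each immediate above is simply the 8-entry truth table of the expression: bit i of the constant is the result for inputs a = bit 2 of i, b = bit 1 and c = bit 0. A throwaway sketch that derives the constants (tl_imm8 is a hypothetical helper, not part of this codebase):

#include <stdint.h>
#include <stdio.h>

static uint8_t tl_imm8( int (*f)( int, int, int ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
   return imm;
}

static int f_xor3  ( int a, int b, int c ) { return a ^ b ^ c; }
static int f_xorand( int a, int b, int c ) { return a ^ ( b & c ); }
static int f_nand  ( int a, int b, int c ) { (void)c; return !( a & b ); }

int main(void)
{
   printf( "xor3   0x%02x\n", tl_imm8( f_xor3 ) );    /* prints 0x96 */
   printf( "xorand 0x%02x\n", tl_imm8( f_xorand ) );  /* prints 0x78 */
   printf( "nand   0x%02x\n", tl_imm8( f_nand ) );    /* prints 0x3f */
   return 0;
}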
// Bit rotations. // Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit

util.c
View File

@@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len)
return s; return s;
} }
char *bebin2hex(const unsigned char *p, size_t len)
{
char *s = (char*) malloc((len * 2) + 1);
if (!s) return NULL;
for ( size_t i = 0, j = len - 1; i < len; i++, j-- )
sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] );
return s;
}
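bebin2hex walks the buffer from the last byte to the first, so callers get big-endian display order where abin2hex gives memory order. Illustrative usage, assuming the two declarations are in scope:

#include <stdio.h>
#include <stdlib.h>

char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );

int main(void)
{
   unsigned char xn2[4] = { 0x01, 0x02, 0x03, 0x04 };
   char *be = bebin2hex( xn2, 4 );   /* "04030201": last byte first */
   char *le = abin2hex( xn2, 4 );    /* "01020304": memory order */
   printf( "%s %s\n", be, le );
   free( be );
   free( le );
   return 0;
}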
bool hex2bin(unsigned char *p, const char *hexstr, size_t len) bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
{ {
char hex_byte[3]; char hex_byte[3];
@@ -1789,10 +1798,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
if ( !stratum_handle_method( sctx, sret ) ) if ( !stratum_handle_method( sctx, sret ) )
applog( LOG_WARNING, "Stratum answer id is not correct!" ); applog( LOG_WARNING, "Stratum answer id is not correct!" );
} }
res_val = json_object_get( extra, "result" ); else
if (opt_debug && (!res_val || json_is_false(res_val))) {
applog(LOG_DEBUG, "Method extranonce.subscribe is not supported"); res_val = json_object_get( extra, "result" );
json_decref( extra ); if ( opt_debug && ( !res_val || json_is_false( res_val ) ) )
applog( LOG_DEBUG,
"Method extranonce.subscribe is not supported" );
}
json_decref( extra );
} }
free(sret); free(sret);
} }

View File

@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# Westmere SSE4.2 AES # Westmere SSE4.2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe