Jay D Dee
2021-07-26 15:01:37 -04:00
parent 92b3733925
commit 9b905fccc8
33 changed files with 889 additions and 565 deletions

View File

@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
Change Log
----------
v3.17.1
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
Fixed my-gr algo for VAES.
v3.17.0
AVX512 optimized using ternary logic instructions.
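The "ternary logic" changes in the diffs below replace chains of two-input XORs with three-input helpers (mm128/mm256/mm512_xor3 and the xorand variants). A minimal sketch of how such helpers can map to a single vpternlog instruction, assuming AVX512F/AVX512VL; the actual definitions live in the project's simd-utils headers and may differ from this sketch. The immediate 0x96 is the truth table of a ^ b ^ c.
#if defined(__AVX512F__) && defined(__AVX512VL__)
  // One vpternlogq per three-input XOR.
  #define mm512_xor3( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x96 )
  #define mm256_xor3( a, b, c )  _mm256_ternarylogic_epi64( a, b, c, 0x96 )
  #define mm128_xor3( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x96 )
#else
  // Fallback: same result from two chained XORs.
  #define mm256_xor3( a, b, c )  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
  #define mm128_xor3( a, b, c )  _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#endif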

View File

@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );
#endif // AVX512
#endif // AVX2

View File

@@ -669,14 +669,14 @@ do { \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
H0 = mm256_xor3( V8, V0, H0 ); \
H1 = mm256_xor3( V9, V1, H1 ); \
H2 = mm256_xor3( VA, V2, H2 ); \
H3 = mm256_xor3( VB, V3, H3 ); \
H4 = mm256_xor3( VC, V4, H4 ); \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
@@ -808,14 +808,14 @@ do { \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
} \
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
H0 = mm512_xor3( V8, V0, H0 ); \
H1 = mm512_xor3( V9, V1, H1 ); \
H2 = mm512_xor3( VA, V2, H2 ); \
H3 = mm512_xor3( VB, V3, H3 ); \
H4 = mm512_xor3( VC, V4, H4 ); \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
#endif

View File

@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
}
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )

View File

@@ -4,7 +4,6 @@
#include <stdint.h>
#include "algo-gate-api.h"
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
#elif defined (BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -368,7 +368,7 @@ do { \
ROUND8W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
#undef G8W
#undef ROUND8W
@@ -566,7 +566,7 @@ do { \
ROUND16W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
#undef G16W
#undef ROUND16W

View File

@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
__m512i S0, S1, S2, S3; \
uint64_t T0, T1;
#define COMPRESS64_8WAY( buf ) do \
@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
V8 = m512_const1_64( CB0 ); \
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
ROUND_B_8WAY(3); \
ROUND_B_8WAY(4); \
ROUND_B_8WAY(5); \
H0 = mm512_xor4( V8, V0, S0, H0 ); \
H1 = mm512_xor4( V9, V1, S1, H1 ); \
H2 = mm512_xor4( VA, V2, S2, H2 ); \
H3 = mm512_xor4( VB, V3, S3, H3 ); \
H4 = mm512_xor4( VC, V4, S0, H4 ); \
H5 = mm512_xor4( VD, V5, S1, H5 ); \
H6 = mm512_xor4( VE, V6, S2, H6 ); \
H7 = mm512_xor4( VF, V7, S3, H7 ); \
H0 = mm512_xor3( V8, V0, H0 ); \
H1 = mm512_xor3( V9, V1, H1 ); \
H2 = mm512_xor3( VA, V2, H2 ); \
H3 = mm512_xor3( VB, V3, H3 ); \
H4 = mm512_xor3( VC, V4, H4 ); \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
void blake512_8way_compress( blake_8way_big_context *sc )
@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
void blake512_8way_init( blake_8way_big_context *sc )
{
__m512i zero = m512_zero;
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->S, 0 ) = zero;
casti_m512i( sc->S, 1 ) = zero;
casti_m512i( sc->S, 2 ) = zero;
casti_m512i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->S, 0 ) = m512_zero;
casti_m512i( sc->S, 1 ) = m512_zero;
casti_m512i( sc->S, 2 ) = m512_zero;
casti_m512i( sc->S, 3 ) = m512_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
uint64_t T0, T1;
#define COMPRESS64_4WAY do \
@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
V8 = m256_const1_64( CB0 ); \
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = mm256_xor4( V8, V0, S0, H0 ); \
H1 = mm256_xor4( V9, V1, S1, H1 ); \
H2 = mm256_xor4( VA, V2, S2, H2 ); \
H3 = mm256_xor4( VB, V3, S3, H3 ); \
H4 = mm256_xor4( VC, V4, S0, H4 ); \
H5 = mm256_xor4( VD, V5, S1, H5 ); \
H6 = mm256_xor4( VE, V6, S2, H6 ); \
H7 = mm256_xor4( VF, V7, S3, H7 ); \
H0 = mm256_xor3( V8, V0, H0 ); \
H1 = mm256_xor3( V9, V1, H1 ); \
H2 = mm256_xor3( VA, V2, H2 ); \
H3 = mm256_xor3( VB, V3, H3 ); \
H4 = mm256_xor3( VC, V4, H4 ); \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = m256_zero;
casti_m256i( sc->S, 1 ) = m256_zero;
casti_m256i( sc->S, 2 ) = m256_zero;
casti_m256i( sc->S, 3 ) = m256_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
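// Note on the removed S0..S3 terms above (my reading; not stated in the diff):
// S[] are the Blake-512 salt words, and the 4-way/8-way contexts here never
// set a salt, so every S[i] is zero and XOR-ing it in is a no-op.  Hence:
//    V8 = S0 ^ CB0              ->  V8 = CB0                  (S0 == 0)
//    H0 = V8 ^ V0 ^ S0 ^ H0     ->  H0 = V8 ^ V0 ^ H0         (xor4 -> xor3)
// which is why the salt loads, stores and zero-initialisation can be dropped.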

View File

@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
qt[30] = expand2s8( qt, M, H, 30 );
qt[31] = expand2s8( qt, M, H, 31 );
xl = _mm256_xor_si256(
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
mm256_xor3( qt[19], qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) );
xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
mm256_xor3( qt[26], qt[27], qt[28] ),
mm256_xor3( qt[29], qt[30], qt[31] ) );
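// Equivalence note (paraphrase, not part of the diff): both the old xor4
// grouping and the new xor3 grouping fold the same terms, since XOR is
// associative and commutative:
//    xl = qt[16] ^ qt[17] ^ ... ^ qt[23]
//    xh = xl ^ qt[24] ^ qt[25] ^ ... ^ qt[31]
// Regrouping into threes just lets each group compile to one vpternlog.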
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
_mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ), \
mm256_xor3( xl, qt[b], qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
_mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ), \
mm256_xor3( xl, qt[b], qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
mm256_xor3( xh, qt[a], M[m] ) ), \
mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
mm256_xor3( xh, qt[a], M[m] ) ), \
mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
#undef DH2L
#undef DH2R
/*
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
_mm256_srli_epi32( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm256_add_epi32(
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
_mm256_slli_epi32( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm256_add_epi32(
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
_mm256_slli_epi32( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm256_add_epi32(
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
_mm256_slli_epi32( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm256_add_epi32(
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
_mm256_slli_epi32( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm256_add_epi32(
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
_mm256_srli_epi32( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm256_add_epi32(
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
_mm256_slli_epi32( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm256_add_epi32(
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
_mm256_slli_epi32( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[4], 9 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
_mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[5], 10 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
dH[10] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[6], 11 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
_mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
_mm256_xor_si256( qt[17], qt[10] ) ) );
dH[11] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[7], 12 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
_mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
_mm256_xor_si256( qt[18], qt[11] ) ) );
dH[12] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[0], 13 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
_mm256_xor_si256( qt[19], qt[12] ) ) );
dH[13] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[1], 14 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
_mm256_xor_si256( qt[20], qt[13] ) ) );
dH[14] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[2], 15 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
_mm256_xor_si256( qt[21], qt[14] ) ) );
dH[15] = _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32( dH[3], 16 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}
static const __m256i final_s8[16] =
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
qt[30] = expand2s16( qt, M, H, 30 );
qt[31] = expand2s16( qt, M, H, 31 );
xl = _mm512_xor_si512(
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
mm512_xor3( qt[19], qt[20], qt[21] ),
_mm512_xor_si512( qt[22], qt[23] ) );
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
mm512_xor3( qt[26], qt[27], qt[28] ),
mm512_xor3( qt[29], qt[30], qt[31] ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm512_add_epi32( \
_mm512_xor_si512( M[m], \
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
_mm512_srli_epi32( qt[a], sr ) ) ), \
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
_mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
_mm512_srli_epi32( qt[a], sr ) ), \
mm512_xor3( xl, qt[b], qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm512_add_epi32( \
_mm512_xor_si512( M[m], \
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
_mm512_slli_epi32( qt[a], sr ) ) ), \
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
_mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
_mm512_slli_epi32( qt[a], sr ) ), \
mm512_xor3( xl, qt[b], qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm512_add_epi32( _mm512_add_epi32( \
mm512_rol_32( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
_mm512_xor_si512( qt[b], qt[c] ) ) );
mm512_xor3( xh, qt[a], M[m] ) ), \
mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm512_add_epi32( _mm512_add_epi32( \
mm512_rol_32( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
_mm512_xor_si512( qt[b], qt[c] ) ) );
mm512_xor3( xh, qt[a], M[m] ) ), \
mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );

View File

@@ -1285,12 +1285,13 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[30] = expand2b8( qt, M, H, 30 );
qt[31] = expand2b8( qt, M, H, 31 );
xl = _mm512_xor_si512(
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
mm512_xor3( qt[19], qt[20], qt[21] ),
_mm512_xor_si512( qt[22], qt[23] ) );
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
mm512_xor3( qt[26], qt[27], qt[28] ),
mm512_xor3( qt[29], qt[30], qt[31] ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \

View File

@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
k1 = _mm_add_epi32(k1, M128(const1));\
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
k1 = _mm_add_epi32(k1, M128(const1));\
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
k1 = _mm_add_epi32(k1, M128(const1));\
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
k1 = _mm_add_epi32(k1, M128(const1));\
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
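// Reading of ECHO_SUBBYTES4 (assumption): it fuses the four per-row
// ECHO_SUBBYTES( state, 0..3, j ) calls for one column j, running the four
// k1-keyed aesenc steps first and batching the zero-key aesenc steps at the
// end.  The zero-key step does not read k1, so reordering it is harmless and
// the round-key schedule advances exactly as before.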
#define ECHO_SUBBYTES(state, i, j) \
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
k1 = _mm_add_epi32(k1, M128(const1));\
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES4(_state, 0);\
ECHO_SUBBYTES4(_state, 1);\
ECHO_SUBBYTES4(_state, 2);\
ECHO_SUBBYTES4(_state, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES4(_state2, 0);\
ECHO_SUBBYTES4(_state2, 1);\
ECHO_SUBBYTES4(_state2, 2);\
ECHO_SUBBYTES4(_state2, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
/*
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
*/
#define SAVESTATE(dst, src)\

View File

@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//#define mul2mask m512_const2_64( 0, 0x00001b00 )
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
//#define lsbmask m512_const1_32( 0x01010101 )
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
k1 = _mm512_add_epi32( k1, one ); \
state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
k1 = _mm512_add_epi32( k1, one ); \
state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
k1 = _mm512_add_epi32( k1, one ); \
state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
k1 = _mm512_add_epi32( k1, one ); \
state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 );\
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES4(_state, 0);\
ECHO_SUBBYTES4(_state, 1);\
ECHO_SUBBYTES4(_state, 2);\
ECHO_SUBBYTES4(_state, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES4(_state2, 0);\
ECHO_SUBBYTES4(_state2, 1);\
ECHO_SUBBYTES4(_state2, 2);\
ECHO_SUBBYTES4(_state2, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
/*
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
*/
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
#define lsbmask_2way m256_const1_32( 0x01010101 )
#define ECHO_SUBBYTES4_2WAY( state, j ) \
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
} while(0)
#define ECHO_ROUND_UNROLL2_2WAY \
ECHO_SUBBYTES4_2WAY(_state, 0);\
ECHO_SUBBYTES4_2WAY(_state, 1);\
ECHO_SUBBYTES4_2WAY(_state, 2);\
ECHO_SUBBYTES4_2WAY(_state, 3);\
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES4_2WAY(_state2, 0);\
ECHO_SUBBYTES4_2WAY(_state2, 1);\
ECHO_SUBBYTES4_2WAY(_state2, 2);\
ECHO_SUBBYTES4_2WAY(_state2, 3);\
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
/*
#define ECHO_ROUND_UNROLL2_2WAY \
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
*/
#define SAVESTATE_2WAY(dst, src)\
dst[0][0] = src[0][0];\

View File

@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
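// Why the slimmed-down SUBSTITUTE works (assumption, standard AES-NI trick):
// aesenclast performs ShiftRows, then SubBytes, then XORs in the round key.
// The _inv_shift_rows shuffle applied first cancels the ShiftRows, and the
// zero key makes the final XOR a no-op, so the macro is a pure byte-wise AES
// S-box and no longer needs the _t1/_t3/_t0 scratch arguments.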
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = t2;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = t4;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = t2;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = _mm_xor_si128(t2, t3);\
t2 = _mm_xor_si128(t2, t0);\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = t0;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
UNPACK_S0(r4c, r4a, _t3)
#define LOADCOLUMN(x, s, a)\
block[0] = col[(base + a + 0) % s];\
block[1] = col[(base + a + 1) % s];\
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
break;
}
while( uBlockCount > 0 )
{
TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
_t0, _t1, _t2 );
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
ctx->state[5], ctx->state[11], ctx->state[0],
ctx->state[10], ctx->state[4], ctx->state[10],
ctx->state[11], ctx->state[9], ctx->state[3],
ctx->state[9], ctx->state[10], ctx->state[8],
ctx->state[2] );
SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
ctx->base++;
pmsg += 4;
uBlockCount--;
if( uBlockCount == 0 ) break;
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
_t0, _t1, _t2 );
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
ctx->state[5], ctx->state[6], ctx->state[4], ctx->state[10]);
SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
ctx->base++;
pmsg += 4;
uBlockCount--;
if( uBlockCount == 0 ) break;
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
_t0, _t1, _t2);
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
ctx->base = 0;
pmsg += 4;
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
}
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
}

View File

@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
j = _mm_cmpgt_epi8( m128_zero, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
i = mm128_xorand(i, j, k );\
}
/**/
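// MUL2 doubles every byte in GF(2^8): shift left by one and, where the top
// bit was set, XOR in the 0x1b reduction polynomial held in k.  The old code
// built the "top bit set" mask with xor(j,j) + cmpgt(j,i); comparing m128_zero
// against i directly saves the clear, and mm128_xorand folds the masked
// reduction (and + xor) into one operation.  A hedged sketch of that helper,
// assuming AVX512VL (0x78 is the truth table of a ^ (b & c)); the real
// definition is in the simd-utils headers:
#if defined(__AVX512F__) && defined(__AVX512VL__)
  #define mm128_xorand( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
  #define mm128_xorand( a, b, c )  _mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif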
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
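// The AVX512VL MixBytes above keeps the same dataflow as the generic version
// below; the difference is that each pair of accumulating XORs is folded into
// one ternary-logic op and the t2 value (a2 ^ a3) is written straight into
// TEMP2 instead of being carried in a2.  For example the first y-value
// (paraphrase of the two paths):
//    generic:   b0 = _mm_xor_si128( b0, a4 );  b0 = _mm_xor_si128( b0, a6 );  TEMP0 = b0;
//    AVX512VL:  TEMP0 = mm128_xor3( b0, a4, a6 );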
/* one round
* a0-a7 = input rows

View File

@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
j = _mm_cmpgt_epi8( m128_zero, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
i = mm128_xorand(i, j, k );\
}
/* Yet another implementation of MixBytes.
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
/* one round
* i = round number
* a0-a7 = input rows

View File

@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
i = mm512_xorand( i, j, k );\
}
/* Yet another implementation of MixBytes.
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
#if 0
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
#endif
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\

View File

@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
i = mm512_xorand( i, j, k );\
}
/**/
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
/* one round
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
j = _mm256_xor_si256(j, j);\
j = _mm256_cmpgt_epi8(j, i );\
j = _mm256_cmpgt_epi8( m256_zero, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
i = mm256_xorand( i, j, k );\
}
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\

View File

@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
#else
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
hash4, hash5, hash6, hash7, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 512 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
#endif
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
}

View File

@@ -39,17 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
#if defined(HMAC_SPH_SHA)
sph_sha256_context ctx;
sph_sha256_init( &ctx );
sph_sha256( &ctx, in, len );
sph_sha256_close( &ctx, digest );
#else
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
#endif
}
/**
@@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->ictx );
sph_sha256( &ctx->ictx, K, Klen );
sph_sha256_close( &ctx->ictx, khash );
#else
SHA256_Init( &ctx->ictx );
SHA256_Update( &ctx->ictx, K, Klen );
SHA256_Final( khash, &ctx->ictx );
#endif
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->ictx );
#else
SHA256_Init( &ctx->ictx );
#endif
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
memset( pad + Klen, 0x36, 64 - Klen );
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->ictx, pad, 64 );
#else
SHA256_Update( &ctx->ictx, pad, 64 );
#endif
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->octx );
#else
SHA256_Init( &ctx->octx );
#endif
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
memset( pad + Klen, 0x5c, 64 - Klen );
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->octx, pad, 64 );
#else
SHA256_Update( &ctx->octx, pad, 64 );
#endif
}
/* Add bytes to the HMAC-SHA256 operation. */
@@ -131,11 +102,7 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
/* Feed data to the inner SHA256 operation. */
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->ictx, in, len );
#else
SHA256_Update( &ctx->ictx, in, len );
#endif
}
/* Finish an HMAC-SHA256 operation. */
@@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
{
unsigned char ihash[32];
#if defined(HMAC_SPH_SHA)
sph_sha256_close( &ctx->ictx, ihash );
sph_sha256( &ctx->octx, ihash, 32 );
sph_sha256_close( &ctx->octx, digest );
#else
/* Finish the inner SHA256 operation. */
SHA256_Final( ihash, &ctx->ictx );
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
SHA256_Final( digest, &ctx->octx );
#endif
}
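
Taken together, Init/Update/Final implement the standard HMAC construction: the inner hash runs over (K xor 0x36 pad) || message, the outer hash over (K xor 0x5c pad) || inner digest, and keys longer than 64 bytes are first reduced to SHA256(K). A minimal usage sketch, assuming the HMAC_SHA256_CTX API shown above; the header name is an assumption:

#include <stddef.h>
#include <stdint.h>
#include "hmac-sha256-hash.h"   /* assumed header for the functions in this file */

/* Compute HMAC-SHA256( key, msg ) into mac[32] with the primitives above. */
static void hmac_sha256_oneshot( const void *key, size_t keylen,
                                 const void *msg, size_t msglen,
                                 uint8_t mac[32] )
{
    HMAC_SHA256_CTX ctx;
    HMAC_SHA256_Init( &ctx, key, keylen );   /* oversized keys are pre-hashed */
    HMAC_SHA256_Update( &ctx, msg, msglen );
    HMAC_SHA256_Final( mac, &ctx );
}
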
/**

View File

@@ -29,24 +29,14 @@
#ifndef HMAC_SHA256_H__
#define HMAC_SHA256_H__
//#define HMAC_SSL_SHA 1
#define HMAC_SPH_SHA 1
#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
#include <openssl/sha.h>
typedef struct HMAC_SHA256Context
{
#if defined(HMAC_SPH_SHA)
sph_sha256_context ictx;
sph_sha256_context octx;
#else
SHA256_CTX ictx;
SHA256_CTX octx;
#endif
} HMAC_SHA256_CTX;
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );

View File

@@ -41,7 +41,7 @@
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
#define ROTR SPH_ROTR32
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
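
The rewritten MAJ exploits the SHA-256 round rotation: this round's Y ^ Z is exactly the previous round's X ^ Y, so the value is computed once per step and carried in Y_xor_Z, which is why a `Y_xor_Z = X_xor_Y;` line is added to every step further down in this diff. A small standalone check that the cached form matches the textbook majority; maj_ref and maj_cached are illustrative names only:

#include <assert.h>
#include <stdint.h>

/* Textbook majority function. */
static uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( x & z ) | ( y & z ); }

/* Cached-XOR form used above: Y ^ ( (X ^ Y) & (Y ^ Z) ). */
static uint32_t maj_cached( uint32_t x, uint32_t y, uint32_t y_xor_z )
{ return y ^ ( ( x ^ y ) & y_xor_z ); }

int main( void )
{
    uint32_t a = 0xdeadbeef, b = 0x01234567, c = 0x89abcdef;
    /* Y_xor_Z for the next round is this round's X ^ Y, so one XOR is reused. */
    assert( maj_cached( a, b, b ^ c ) == maj_ref( a, b, c ) );
    return 0;
}
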
@@ -319,6 +319,7 @@ static const sph_u32 K[64] = {
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
Y_xor_Z = X_xor_Y; \
d = SPH_T32(d + t1); \
h = SPH_T32(t1 + t2); \
} while (0)
@@ -329,7 +330,7 @@ static const sph_u32 K[64] = {
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H; \
sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
sph_u32 W[16]; \
unsigned pcount; \
\
@@ -342,6 +343,7 @@ static const sph_u32 K[64] = {
G = (r)[6]; \
H = (r)[7]; \
pcount = 0; \
Y_xor_Z = B ^ C; \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
@@ -389,7 +391,7 @@ static const sph_u32 K[64] = {
#else // large footprint (default)
#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
@@ -401,388 +403,453 @@ static const sph_u32 K[64] = {
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
Y_xor_Z = B ^ C; \
W00 = in(0); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x428A2F98) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = in(1); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x71374491) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = in(2); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB5C0FBCF) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = in(3); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xE9B5DBA5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = in(4); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x3956C25B) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = in(5); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x59F111F1) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = in(6); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x923F82A4) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = in(7); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xAB1C5ED5) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = in(8); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xD807AA98) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = in(9); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x12835B01) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = in(10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x243185BE) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = in(11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x550C7DC3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = in(12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x72BE5D74) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = in(13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x80DEB1FE) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = in(14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x9BDC06A7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = in(15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC19BF174) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xE49B69C1) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xEFBE4786) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x0FC19DC6) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x240CA1CC) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x2DE92C6F) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4A7484AA) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5CB0A9DC) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x76F988DA) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x983E5152) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA831C66D) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB00327C8) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xBF597FC7) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xC6E00BF3) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD5A79147) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x06CA6351) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x14292967) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x27B70A85) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x2E1B2138) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x4D2C6DFC) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x53380D13) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x650A7354) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x766A0ABB) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x81C2C92E) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x92722C85) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xA2BFE8A1) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA81A664B) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xC24B8B70) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xC76C51A3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xD192E819) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD6990624) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xF40E3585) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x106AA070) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x19A4C116) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x1E376C08) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x2748774C) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x34B0BCB5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x391C0CB3) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4ED8AA4A) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5B9CCA4F) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x682E6FF3) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x748F82EE) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x78A5636F) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x84C87814) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x8CC70208) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x90BEFFFA) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xA4506CEB) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xBEF9A3F7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC67178F2) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
(r)[0] = SPH_T32((r)[0] + A); \

View File

@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
__m512i *M = (__m512i*)msg;
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;
P0 = H[0];
@@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
_mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 )
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1,
@@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
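
Instead of building a second constant with one counter word complemented (the removed _mm512_set4_epi32 calls with ~ctx->countN), the new code keeps the single count vector and complements only the selected 32-bit lanes by XOR-ing them with all-ones under a write mask (0x8888, 0x1111 and 0x2222 each pick one lane per 128-bit slice), then rotates the lanes into position. A minimal sketch of the masked-complement idiom, assuming AVX512F; masked_not_epi32 is an illustrative helper:

#include <immintrin.h>   /* compile with -mavx512f */

/* Complement only the 32-bit lanes selected by mask k; unselected lanes
 * are copied unchanged from v. */
static inline __m512i masked_not_epi32( __m512i v, __mmask16 k )
{
    const __m512i ones = _mm512_set1_epi32( -1 );
    /* _mm512_mask_xor_epi32( src, k, a, b ): selected lanes get a ^ b,
     * unselected lanes come from src. */
    return _mm512_mask_xor_epi32( v, k, v, ones );
}
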

View File

@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
// c1_16(185), c1_16(233) };
S0l = _mm512_xor_si512( S[0], M[0] );
S0h = _mm512_xor_si512( S[1], M[1] );
S1l = _mm512_xor_si512( S[2], M[2] );
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// targeted, local macros don't need a unique name
#define S(i) S##i
#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
/*
#define F_0(B, C, D) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
#define F_1(B, C, D) \
_mm512_or_si512( _mm512_and_si512( D, C ),\
_mm512_and_si512( _mm512_or_si512( D,C ), B ) )
*/
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

View File

@@ -6,10 +6,6 @@
#define PRINT_SOME 0
/* JDD all occurrences of macro X in this file renamed to XX
* due to name conflict
*/
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;

View File

@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define INTEGERIFY (uint32_t)X.d[0]
#endif
// AVX512 ternary logic optimization
#if defined(__AVX512VL__)
#define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)
#else
#define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \
XOR_X( in2 )
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )
#endif
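
Each AVX512VL macro above folds a pair of XORs into one vpternlogd with immediate 0x96, which evaluates a ^ b ^ c per bit; the #else branch keeps the two-instruction fallback for older targets. A minimal sketch of the same idea at 128-bit width; xor3_epi32 is an illustrative name, not part of this source:

#include <immintrin.h>

/* a ^ b ^ c: one instruction with AVX512VL, two XORs otherwise. */
static inline __m128i xor3_epi32( __m128i a, __m128i b, __m128i c )
{
#if defined(__AVX512VL__)
    return _mm_ternarylogic_epi32( a, b, c, 0x96 );   /* imm 0x96 = 3-input XOR */
#else
    return _mm_xor_si128( a, _mm_xor_si128( b, c ) );
#endif
}
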
/**
* Apply the Salsa20 core to the block provided in X ^ in.
*/
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
{
DECL_X
XOR_X_2(Bin1[1], Bin2[1])
XOR_X(Bin1[0])
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
// XOR_X_2(Bin1[1], Bin2[1])
// XOR_X(Bin1[0])
SALSA20_XOR_MEM(Bin2[0], Bout[0])
XOR_X(Bin1[1])
SALSA20_XOR_MEM(Bin2[1], Bout[1])
// Factor out the XOR from salsa20 to do a xor3
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
return INTEGERIFY;
}
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
i = 0;
r--;
do {
XOR_X(Bin1[i])
XOR_X(Bin2[i])
XOR_X_XOR_X( Bin1[i], Bin2[i] )
// XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM
WRITE_X(Bout[i])
XOR_X(Bin1[i + 1])
XOR_X(Bin2[i + 1])
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
// XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM
if (unlikely(i >= r))

View File

@@ -35,7 +35,6 @@
#include "miner.h"
#include "simd-utils.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
#ifdef __cplusplus
extern "C" {

View File

@@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx
# Westmere SSE4.2 AES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.0.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.17.0'
PACKAGE_STRING='cpuminer-opt 3.17.0'
PACKAGE_VERSION='3.17.1'
PACKAGE_STRING='cpuminer-opt 3.17.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.17.0 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.17.0:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.17.0
cpuminer-opt configure 3.17.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.17.0, which was
It was created by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.17.0'
VERSION='3.17.1'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.17.0, which was
This file was extended by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.17.0
cpuminer-opt config.status 3.17.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.17.0])
AC_INIT([cpuminer-opt], [3.17.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1054,6 +1054,8 @@ void report_summary_log( bool force )
applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz",
tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 );
if ( curr_temp > hi_temp ) hi_temp = curr_temp;
if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) )
restart_threads();
prev_temp = curr_temp;
}
}
@@ -2856,7 +2858,6 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx2;
bool use_avx512;
bool use_sha;
@@ -2934,7 +2935,8 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
@@ -2972,13 +2974,12 @@ static bool cpu_capability( bool display_only )
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
&& ( use_avx512 || algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes
|| algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
use_sha || use_vaes );
// Display best options
@@ -2988,7 +2989,6 @@ static bool cpu_capability( bool display_only )
{
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );

View File

@@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
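
The immediates used by these helpers (0x96 for a ^ b ^ c, 0x78 for a ^ ( b & c )) are simply the truth table of the target function: bit i of the immediate is the function's value when the three source bits equal the bits of i, with the first operand assumed to map to the most significant selector bit. A small sketch that recomputes both constants; ternlog_imm and the f_* callbacks are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Build a vpternlog immediate from a 3-input boolean function:
 * bit i of the result is f( a, b, c ) where a = bit2(i), b = bit1(i), c = bit0(i). */
static uint8_t ternlog_imm( int (*f)( int, int, int ) )
{
    uint8_t imm = 0;
    for ( int i = 0; i < 8; i++ )
        imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
    return imm;
}

static int f_xor3( int a, int b, int c )   { return a ^ b ^ c; }
static int f_xorand( int a, int b, int c ) { return a ^ ( b & c ); }

int main( void )
{
    printf( "xor3   imm = 0x%02x\n", ternlog_imm( f_xor3 ) );    /* prints 0x96 */
    printf( "xorand imm = 0x%02x\n", ternlog_imm( f_xorand ) );  /* prints 0x78 */
    return 0;
}
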
//
// Bit rotations

View File

@@ -275,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements across all lanes.
//
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#if defined(__AVX512VL__)
#if defined(__AVX512F__) && defined(__AVX512VL__)
static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
@@ -293,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
#else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -304,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 )
#endif // AVX512 else AVX2
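
On AVX512VL the one-element rotations above use valignq, which concatenates the vector with itself and shifts right by a whole 64-bit element, while the plain AVX2 path gets the same permutation from vpermq with an immediate selector. A minimal sketch showing the two forms side by side; ror256_1x64 is an illustrative wrapper, not part of this source:

#include <immintrin.h>

/* Rotate a 256-bit vector right by one 64-bit element.
 * Both forms produce { v1, v2, v3, v0 } from { v0, v1, v2, v3 }. */
static inline __m256i ror256_1x64( __m256i v )
{
#if defined(__AVX512F__) && defined(__AVX512VL__)
    return _mm256_alignr_epi64( v, v, 1 );       /* valignq: self-concatenate, shift by 1 */
#else
    return _mm256_permute4x64_epi64( v, 0x39 );  /* vpermq: result lanes 0..3 take source lanes 1,2,3,0 */
#endif
}
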
//

View File

@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# Westmere SSE4.2 AES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe