diff --git a/RELEASE_NOTES b/RELEASE_NOTES index cdacd32..3f6b080 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.17.1 + +Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES. +More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES. +Fixed my-gr algo for VAES. + v3.17.0 AVX512 optimized using ternary logic instructions. diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index fc64583..a5d74e0 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len ); void blake512_8way_close( void *cc, void *dst ); void blake512_8way_full( blake_8way_big_context *sc, void * dst, const void *data, size_t len ); +void blake512_8way_hash_le80( void *hash, const void *data ); #endif // AVX512 #endif // AVX2 diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index 3de0363..65fbe1f 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -669,14 +669,14 @@ do { \ ROUND_S_8WAY(2); \ ROUND_S_8WAY(3); \ } \ - H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \ - H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \ - H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \ - H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \ - H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \ - H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -808,14 +808,14 @@ do { \ ROUND_S_16WAY(2); \ ROUND_S_16WAY(3); \ } \ - H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \ - H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \ - H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \ - H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \ - H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \ - H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \ - H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \ - H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) #endif diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index d9853c2..f482443 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last ) B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] ); } - ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] ); - ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] ); - ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] ); - ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] ); - ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), 
v[12] ); - ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] ); - ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] ); - ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] ); + ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] ); + ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] ); + ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] ); + ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] ); + ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] ); + ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] ); + ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] ); + ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] ); } int blake2b_8way_init( blake2b_8way_ctx *ctx ) diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index 4c621b4..4a7942c 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -4,7 +4,6 @@ #include #include "algo-gate-api.h" -//#if defined(__SSE4_2__) #if defined(__SSE2__) #define BLAKE2S_4WAY #endif @@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, #elif defined (BLAKE2S_8WAY) -//#if defined(BLAKE2S_8WAY) - void blake2s_8way_hash( void *state, const void *input ); int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index 094edd0..190ad0b 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -368,7 +368,7 @@ do { \ ROUND8W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] ); #undef G8W #undef ROUND8W @@ -566,7 +566,7 @@ do { \ ROUND16W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] ); #undef G16W #undef ROUND16W diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index a5d5394..d1b5d2b 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -293,10 +293,6 @@ static const sph_u64 CB[16] = { H5 = (state)->H[5]; \ H6 = (state)->H[6]; \ H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) @@ -310,10 +306,6 @@ static const sph_u64 CB[16] = { (state)->H[5] = H5; \ (state)->H[6] = H6; \ (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -348,7 +340,6 @@ static const sph_u64 CB[16] = { #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m512i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_8WAY( buf ) do \ @@ -366,10 +357,10 @@ static const sph_u64 CB[16] = { V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \ - V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \ - VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \ - VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \ + V8 = m512_const1_64( CB0 ); \ + V9 = m512_const1_64( CB1 ); \ + VA = m512_const1_64( CB2 ); \ + VB = m512_const1_64( CB3 ); \ VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ m512_const1_64( CB4 ) ); \ VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ @@ -414,14 +405,14 @@ static const sph_u64 CB[16] = { ROUND_B_8WAY(3); \ ROUND_B_8WAY(4); \ ROUND_B_8WAY(5); \ - 
H0 = mm512_xor4( V8, V0, S0, H0 ); \ - H1 = mm512_xor4( V9, V1, S1, H1 ); \ - H2 = mm512_xor4( VA, V2, S2, H2 ); \ - H3 = mm512_xor4( VB, V3, S3, H3 ); \ - H4 = mm512_xor4( VC, V4, S0, H4 ); \ - H5 = mm512_xor4( VD, V5, S1, H5 ); \ - H6 = mm512_xor4( VE, V6, S2, H6 ); \ - H7 = mm512_xor4( VF, V7, S3, H7 ); \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) void blake512_8way_compress( blake_8way_big_context *sc ) @@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) ); - V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) ); - VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) ); - VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) ); + V8 = m512_const1_64( CB0 ); + V9 = m512_const1_64( CB1 ); + VA = m512_const1_64( CB2 ); + VB = m512_const1_64( CB3 ); VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), m512_const1_64( CB4 ) ); VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), @@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc ) ROUND_B_8WAY(4); ROUND_B_8WAY(5); - sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm512_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm512_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm512_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm512_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm512_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm512_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm512_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm512_xor3( VF, V7, sc->H[7] ); } void blake512_8way_init( blake_8way_big_context *sc ) { - __m512i zero = m512_zero; casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); @@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc ) casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = zero; - casti_m512i( sc->S, 1 ) = zero; - casti_m512i( sc->S, 2 ) = zero; - casti_m512i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst, casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); - casti_m512i( sc->S, 0 ) = m512_zero; - casti_m512i( sc->S, 1 ) = m512_zero; - casti_m512i( sc->S, 2 ) = m512_zero; - casti_m512i( sc->S, 3 ) = m512_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; @@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst) #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ uint64_t T0, T1; #define COMPRESS64_4WAY do \ @@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst) V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 
= _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \ - V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \ - VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \ - VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \ + V8 = m256_const1_64( CB0 ); \ + V9 = m256_const1_64( CB1 ); \ + VA = m256_const1_64( CB2 ); \ + VB = m256_const1_64( CB3 ); \ VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ m256_const1_64( CB4 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ @@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst) ROUND_B_4WAY(3); \ ROUND_B_4WAY(4); \ ROUND_B_4WAY(5); \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) @@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc ) V5 = sc->H[5]; V6 = sc->H[6]; V7 = sc->H[7]; - V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) ); - V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) ); - VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) ); - VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) ); + V8 = m256_const1_64( CB0 ); + V9 = m256_const1_64( CB1 ); + VA = m256_const1_64( CB2 ); + VB = m256_const1_64( CB3 ); VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), m256_const1_64( CB4 ) ); VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), @@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc ) ROUND_B_4WAY(4); ROUND_B_4WAY(5); - sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] ); - sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] ); - sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] ); - sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] ); - sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] ); - sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] ); - sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] ); - sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] ); + sc->H[0] = mm256_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm256_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm256_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm256_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm256_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm256_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm256_xor3( VE, V6, sc->H[6] ); + sc->H[7] = mm256_xor3( VF, V7, sc->H[7] ); } void blake512_4way_init( blake_4way_big_context *sc ) { - __m256i zero = m256_zero; casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B ); @@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc ) casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst, casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( 
sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); - casti_m256i( sc->S, 0 ) = m256_zero; - casti_m256i( sc->S, 1 ) = m256_zero; - casti_m256i( sc->S, 2 ) = m256_zero; - casti_m256i( sc->S, 3 ) = m256_zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 92e7183..8b9de76 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], qt[30] = expand2s8( qt, M, H, 30 ); qt[31] = expand2s8( qt, M, H, 31 ); - xl = _mm256_xor_si256( - mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm256_xor_si256( xl, _mm256_xor_si256( - mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ), + mm256_xor3( qt[19], qt[20], qt[21] ), + _mm256_xor_si256( qt[22], qt[23] ) ); + + xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ), + mm256_xor3( qt[26], qt[27], qt[28] ), + mm256_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \ - _mm256_srli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \ + _mm256_srli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm256_add_epi32( \ - _mm256_xor_si256( M[m], \ - _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \ - _mm256_slli_epi32( qt[a], sr ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \ + _mm256_slli_epi32( qt[a], sr ) ), \ + mm256_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \ - _mm256_xor_si256( qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm256_add_epi32( _mm256_add_epi32( \ - mm256_rol_32( dH[h], rl ), \ - _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ - _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \ - _mm256_xor_si256( qt[b], qt[c] ) ) ); + mm256_rol_32( dH[h], rl ), \ + mm256_xor3( xh, qt[a], M[m] ) ), \ + mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); @@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], #undef DH2L #undef DH2R -/* - dH[ 0] = _mm256_add_epi32( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), - _mm256_srli_epi32( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm256_add_epi32( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ), - _mm256_slli_epi32( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm256_add_epi32( - _mm256_xor_si256( M[2], - _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ), - _mm256_slli_epi32( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm256_add_epi32( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi32( xh, 
1 ), - _mm256_slli_epi32( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm256_add_epi32( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ), - _mm256_slli_epi32( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm256_add_epi32( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ), - _mm256_srli_epi32( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm256_add_epi32( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ), - _mm256_slli_epi32( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm256_add_epi32( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ), - _mm256_slli_epi32( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ), - _mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ), - _mm256_xor_si256( qt[19], qt[12] ) ) ); - dH[13] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -*/ } static const __m256i final_s8[16] = @@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16], qt[30] = expand2s16( qt, M, H, 30 ); qt[31] = expand2s16( qt, M, H, 31 ); - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ 
- _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \ - _mm512_srli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \ + _mm512_srli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi32( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \ - _mm512_slli_epi32( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \ + _mm512_slli_epi32( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) ) #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi32( _mm512_add_epi32( \ - mm512_rol_32( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_32( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 4778914..3587cc4 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -1285,12 +1285,13 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[30] = expand2b8( qt, M, H, 30 ); qt[31] = expand2b8( qt, M, H, 31 ); - xl = _mm512_xor_si512( - mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm512_xor_si512( xl, _mm512_xor_si512( - mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), + mm512_xor3( qt[19], qt[20], qt[21] ), + _mm512_xor_si512( qt[22], qt[23] ) ); + + xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ), + mm512_xor3( qt[26], qt[27], qt[28] ), + mm512_xor3( qt[29], qt[30], qt[31] ) ); #define DH1L( m, sl, sr, a, b, c ) \ _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \ diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index a4e3958..ca1688a 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000 MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm_aesenc_si128(state[0][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[1][j] = _mm_aesenc_si128(state[1][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[2][j] = _mm_aesenc_si128(state[2][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[3][j] = _mm_aesenc_si128(state[3][j], k1);\ + k1 = _mm_add_epi32(k1, M128(const1));\ + state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \ + state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \ + state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \ + state[3][j] = 
_mm_aesenc_si128(state[3][j], m128_zero ) + #define ECHO_SUBBYTES(state, i, j) \ state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ k1 = _mm_add_epi32(k1, M128(const1));\ @@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t1 = _mm_and_si128(t1, M128(lsbmask));\ t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ + state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\ state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ @@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ - state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ + state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\ state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ @@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 s2 = _mm_xor_si128(s2, t2);\ state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ - state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ + state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\ state2[3][j] = _mm_xor_si128(state2[3][j], s2) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - +*/ #define SAVESTATE(dst, src)\ diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 51a9f0a..c8e52ca 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -//#define mul2mask m512_const2_64( 0, 0x00001b00 ) -//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) -//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) - -//#define lsbmask m512_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4(state, j) \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[1][j] = 
_mm512_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm512_add_epi32( k1, one ); \ + state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \ + state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \ + state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \ + state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ @@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t1 = _mm512_and_si512( t1, lsbmask ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 );\ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ - _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ @@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ - _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ + state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ @@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ - _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ + state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES4(_state, 0);\ + ECHO_SUBBYTES4(_state, 1);\ + ECHO_SUBBYTES4(_state, 2);\ + ECHO_SUBBYTES4(_state, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4(_state2, 0);\ + ECHO_SUBBYTES4(_state2, 1);\ + ECHO_SUBBYTES4(_state2, 2);\ + ECHO_SUBBYTES4(_state2, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ @@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) +*/ #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ @@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int 
nHashSize, #define lsbmask_2way m256_const1_32( 0x01010101 ) +#define ECHO_SUBBYTES4_2WAY( state, j ) \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); \ + state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \ + state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \ + state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \ + state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero ) + #define ECHO_SUBBYTES_2WAY( state, i, j ) \ state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ k1 = _mm256_add_epi32( k1, m256_one_128 ); \ @@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \ } while(0) +#define ECHO_ROUND_UNROLL2_2WAY \ + ECHO_SUBBYTES4_2WAY(_state, 0);\ + ECHO_SUBBYTES4_2WAY(_state, 1);\ + ECHO_SUBBYTES4_2WAY(_state, 2);\ + ECHO_SUBBYTES4_2WAY(_state, 3);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES4_2WAY(_state2, 0);\ + ECHO_SUBBYTES4_2WAY(_state2, 1);\ + ECHO_SUBBYTES4_2WAY(_state2, 2);\ + ECHO_SUBBYTES4_2WAY(_state2, 3);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) + +/* #define ECHO_ROUND_UNROLL2_2WAY \ ECHO_SUBBYTES_2WAY(_state, 0, 0);\ ECHO_SUBBYTES_2WAY(_state, 1, 0);\ @@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) +*/ #define SAVESTATE_2WAY(dst, src)\ dst[0][0] = src[0][0];\ diff --git a/algo/fugue/fugue-aesni.c b/algo/fugue/fugue-aesni.c index 2dd253a..8f0af13 100644 --- a/algo/fugue/fugue-aesni.c +++ b/algo/fugue/fugue-aesni.c @@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\ s7 = _mm_xor_si128(s7, t1) +#define PRESUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = _mm_and_si128(t1, M128(_lsbmask2));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)) +/* #define PRESUPERMIX(x, t1, s1, s2, t2)\ s1 = x;\ s2 = _mm_add_epi8(x, x);\ @@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = { t1 = _mm_and_si128(t1, M128(_lsbmask2));\ s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\ x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1)) +*/ -#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ +#define SUBSTITUTE(r0, _t2 )\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ _t2 = _mm_aesenclast_si128( _t2, m128_zero ) - + +#define SUPERMIX(t0, t1, t2, t3, t4)\ + t2 = t0;\ + t3 = _mm_add_epi8(t0, t0);\ + t4 = _mm_add_epi8(t3, t3);\ + t1 = _mm_srli_epi16(t0, 6);\ + t1 = 
_mm_and_si128(t1, M128(_lsbmask2));\ + t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \ + t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ + t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ + t4 = _mm_xor_si128(t4, t1);\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ + t2 = mm128_xor3(t2, t3, t0 );\ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ + t4 = mm128_xor3( t4, t1, t2 ); \ + t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ + t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ + t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ + t4 = mm128_xor3( t4, t2, t1 ); \ + t0 = _mm_xor_si128(t0, t3);\ + t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c))); + +/* #define SUPERMIX(t0, t1, t2, t3, t4)\ PRESUPERMIX(t0, t1, t2, t3, t4);\ POSTSUPERMIX(t0, t1, t2, t3, t4) - +*/ #define POSTSUPERMIX(t0, t1, t2, t3, t4)\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\ t4 = t1;\ t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t4;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\ + t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\ t4 = _mm_xor_si128(t4, t1);\ - t1 = t2;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\ + t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\ t4 = _mm_xor_si128(t4, t1);\ - t2 = _mm_xor_si128(t2, t3);\ - t2 = _mm_xor_si128(t2, t0);\ + t2 = mm128_xor3(t2, t3, t0 );\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t4 = _mm_xor_si128(t4, t2);\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t4 = _mm_xor_si128(t4, t2);\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ - t1 = t0;\ - t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\ + t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\ t4 = _mm_xor_si128(t4, t1);\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ t0 = _mm_xor_si128(t0, t3);\ @@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = { t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\ t4 = _mm_xor_si128(t4, t0) - #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ UNPACK_S0(r3c, r3a, _t3) - #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ PACK_S0(r1c, r1a, _t0);\ - SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r1c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\ r2c = _mm_xor_si128(r2c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r2d = _mm_xor_si128(r2d, _t0);\ UNPACK_S0(r1c, r1a, _t3);\ - SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE(r2c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\ r3c = _mm_xor_si128(r3c, _t0);\ 
_t0 = mm128_mask_32( _t0, 8 ); \ r3d = _mm_xor_si128(r3d, _t0);\ UNPACK_S0(r2c, r2a, _t3);\ - SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r3c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\ r4c = _mm_xor_si128(r4c, _t0);\ _t0 = mm128_mask_32( _t0, 8 ); \ r4d = _mm_xor_si128(r4d, _t0);\ UNPACK_S0(r3c, r3a, _t3);\ - SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ + SUBSTITUTE( r4c, _t2 );\ SUPERMIX(_t2, _t3, _t0, _t1, r4c);\ UNPACK_S0(r4c, r4a, _t3) - - #define LOADCOLUMN(x, s, a)\ block[0] = col[(base + a + 0) % s];\ block[1] = col[(base + a + 1) % s];\ @@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 1: TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], ctx->state[ 6], ctx->state[8], - ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); + ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], + SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], - ctx->state[6], ctx->state[0], ctx->state[6], - ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6], ctx->state[4], - ctx->state[10] ); + ctx->state[6], ctx->state[0], ctx->state[6], + ctx->state[7], ctx->state[5], ctx->state[11], + ctx->state[5], ctx->state[6], ctx->state[4], + ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; @@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u case 2: TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[ 1], ctx->state[2], ctx->state[4], - ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); + ctx->state[ 5], ctx->state[6], _t0, _t1, _t2); SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], ctx->state[3], ctx->state[4], - ctx->state[2], ctx->state[8], ctx->state[2], - ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], - ctx->state[6]); + ctx->state[2], ctx->state[8], ctx->state[2], + ctx->state[3], ctx->state[1], ctx->state[7], + ctx->state[1], ctx->state[2], ctx->state[0], + ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u break; } - while( uBlockCount > 0 ) { - TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9], - ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2], - _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11], - ctx->state[5], ctx->state[11], ctx->state[0], - ctx->state[10], ctx->state[4], ctx->state[10], - ctx->state[11], ctx->state[9], ctx->state[3], - ctx->state[9], ctx->state[10], ctx->state[8], - ctx->state[2] ); + TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9], + ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2], + _t0, _t1, _t2 ); + SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5], + ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4], + ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3], + ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5], - ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10], - _t0, _t1, _t2 ); + TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5], + ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10], 
+ _t0, _t1, _t2 ); - SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0], - ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11], - ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]); + SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1], + ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0], + ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11], + ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] ); ctx->base++; pmsg += 4; uBlockCount--; if( uBlockCount == 0 ) break; - TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1], - ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6], - _t0, _t1, _t2); - SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9], - ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8], - ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7], - ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]); + TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1], + ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6], + _t0, _t1, _t2); + SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9], + ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8], + ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7], + ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]); ctx->base = 0; pmsg += 4; @@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u void Final512(hashState_fugue *ctx, BitSequence *hashval) { - unsigned int block[4] __attribute__ ((aligned (32))); - unsigned int col[36] __attribute__ ((aligned (16))); + unsigned int block[4] __attribute__ ((aligned (32))); + unsigned int col[36] __attribute__ ((aligned (16))); unsigned int i, base; __m128i r0, _t0, _t1, _t2, _t3; @@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } @@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); @@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval) // SMIX LOADCOLUMN(r0, 36, 0); - SUBSTITUTE(r0, _t1, _t2, _t3, _t0); + SUBSTITUTE(r0, _t2); SUPERMIX(_t2, _t3, _t0, _t1, r0); STORECOLUMN(r0, 36); } diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index e09e8de..f2d376e 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, 
k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /**/ @@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * a0-a7 = input rows diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 61c1b7b..a8e7674 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ + j = _mm_cmpgt_epi8( m128_zero, i);\ i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + i = mm128_xorand(i, j, k );\ } /* Yet another implementation of MixBytes. @@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ + +#if defined(__AVX512VL__) + +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + TEMP2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm128_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm128_xor3( b1, a5, a7 );\ + b2 = mm128_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b3 = mm128_xor3( b3, a7, a1 ); \ + b1 = a1;\ + b6 = mm128_xor3( b6, a4, TEMP2 ); \ + b4 = mm128_xor3( b4, a0, TEMP2 ); \ + b7 = mm128_xor3( b7, a5, a3 ); \ + b5 = mm128_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(TEMP2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#else + #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ +#endif + /* one round * i = round number * a0-a7 = input rows diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 25d9171..ff62a1c 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /* Yet another implementation of MixBytes. @@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. 
Matusiewicz, 2011/05/29 */ + +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ + /* t_i = a_i + a_{i+1} */\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ + /* spill values y_4, y_5 to memory */\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ +}/*MixBytes*/ + + +#if 0 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* t_i = a_i + a_{i+1} */\ b6 = a0;\ @@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e, b0 = _mm512_xor_si512(b0, a3);\ b1 = _mm512_xor_si512(b1, a4);\ }/*MixBytes*/ - +#endif #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 5d8d715..354e018 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm512_xor_si512(j, j);\ - j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\ i = _mm512_add_epi8(i, i);\ - j = _mm512_and_si512(j, k);\ - i = _mm512_xor_si512(i, j);\ + i = mm512_xorand( i, j, k );\ } /**/ @@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003, We almost fit into 16 registers, need only 3 spills to memory. This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ +#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \ + b0, b1, b2, b3, b4, b5, b6, b7) { \ /* t_i = a_i + a_{i+1} */\ - b6 = a0;\ - b7 = a1;\ - a0 = _mm512_xor_si512(a0, a1);\ - b0 = a2;\ - a1 = _mm512_xor_si512(a1, a2);\ - b1 = a3;\ - a2 = _mm512_xor_si512(a2, a3);\ - b2 = a4;\ - a3 = _mm512_xor_si512(a3, a4);\ - b3 = a5;\ - a4 = _mm512_xor_si512(a4, a5);\ - b4 = a6;\ - a5 = _mm512_xor_si512(a5, a6);\ - b5 = a7;\ - a6 = _mm512_xor_si512(a6, a7);\ - a7 = _mm512_xor_si512(a7, b6);\ + b6 = a0; \ + b7 = a1; \ + a0 = _mm512_xor_si512( a0, a1 ); \ + b0 = a2; \ + a1 = _mm512_xor_si512( a1, a2 ); \ + b1 = a3; \ + TEMP2 = _mm512_xor_si512( a2, a3 ); \ + b2 = a4; \ + a3 = _mm512_xor_si512( a3, a4 ); \ + b3 = a5; \ + a4 = _mm512_xor_si512( a4, a5 );\ + b4 = a6; \ + a5 = _mm512_xor_si512( a5, a6 ); \ + b5 = a7; \ + a6 = _mm512_xor_si512( a6, a7 ); \ + a7 = _mm512_xor_si512( a7, b6 ); \ \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm512_xor_si512(b0, a4);\ - b6 = _mm512_xor_si512(b6, a4);\ - b1 = _mm512_xor_si512(b1, a5);\ - b7 = _mm512_xor_si512(b7, a5);\ - b2 = _mm512_xor_si512(b2, a6);\ - b0 = _mm512_xor_si512(b0, a6);\ + TEMP0 = mm512_xor3( b0, a4, a6 ); \ /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - b3 = _mm512_xor_si512(b3, a7);\ - b1 = _mm512_xor_si512(b1, a7);\ - TEMP1 = b1;\ - b4 = _mm512_xor_si512(b4, a0);\ - b2 = _mm512_xor_si512(b2, a0);\ + TEMP1 = mm512_xor3( b1, a5, a7 ); \ + b2 = mm512_xor3( b2, a6, a0 ); \ /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b5 = _mm512_xor_si512(b5, a1);\ - b3 = _mm512_xor_si512(b3, a1);\ - b1 = a1;\ - b6 = _mm512_xor_si512(b6, a2);\ - b4 = _mm512_xor_si512(b4, a2);\ - TEMP2 = a2;\ - b7 = _mm512_xor_si512(b7, a3);\ - b5 = _mm512_xor_si512(b5, a3);\ + b0 = a0; \ + b3 = mm512_xor3( b3, a7, a1 ); \ + b1 = a1; \ + b6 = mm512_xor3( b6, a4, TEMP2 ); \ + b4 = mm512_xor3( b4, a0, TEMP2 ); \ + b7 = mm512_xor3( b7, a5, a3 ); \ + b5 = mm512_xor3( b5, a1, a3 ); \ \ /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm512_xor_si512(a0, a3);\ - a1 = _mm512_xor_si512(a1, a4);\ - a2 = _mm512_xor_si512(a2, a5);\ - a3 = _mm512_xor_si512(a3, a6);\ - a4 = _mm512_xor_si512(a4, a7);\ - a5 = _mm512_xor_si512(a5, b0);\ - a6 = _mm512_xor_si512(a6, b1);\ - a7 = _mm512_xor_si512(a7, TEMP2);\ + a0 = _mm512_xor_si512( a0, a3 ); \ + a1 = _mm512_xor_si512( a1, a4 ); \ + a2 = _mm512_xor_si512( TEMP2, a5 ); \ + a3 = _mm512_xor_si512( a3, a6 ); \ + a4 = _mm512_xor_si512( a4, a7 ); \ + a5 = _mm512_xor_si512( a5, b0 ); \ + a6 = _mm512_xor_si512( a6, b1 ); \ + a7 = _mm512_xor_si512( a7, TEMP2 ); \ \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\ - MUL2(a0, b0, b1);\ - a0 = _mm512_xor_si512(a0, TEMP0);\ - MUL2(a1, b0, b1);\ - a1 = _mm512_xor_si512(a1, TEMP1);\ - MUL2(a2, b0, b1);\ - a2 = _mm512_xor_si512(a2, b2);\ - MUL2(a3, b0, b1);\ - a3 = _mm512_xor_si512(a3, b3);\ - MUL2(a4, b0, b1);\ - a4 = _mm512_xor_si512(a4, b4);\ - MUL2(a5, b0, b1);\ - a5 = _mm512_xor_si512(a5, b5);\ - MUL2(a6, b0, b1);\ - a6 = _mm512_xor_si512(a6, b6);\ - MUL2(a7, b0, b1);\ - a7 = _mm512_xor_si512(a7, b7);\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \ + MUL2( a0, b0, b1 ); \ + a0 = _mm512_xor_si512( a0, TEMP0 ); \ + MUL2( a1, b0, b1 ); \ + a1 = _mm512_xor_si512( a1, TEMP1 ); \ + MUL2( a2, b0, b1 ); \ + a2 = _mm512_xor_si512( a2, b2 ); \ + MUL2( a3, b0, b1 ); \ + a3 = _mm512_xor_si512( a3, b3 ); \ + MUL2( a4, b0, b1 ); \ + a4 = _mm512_xor_si512( a4, b4 ); \ + MUL2( a5, b0, b1 ); \ + a5 = _mm512_xor_si512( a5, b5 ); \ + MUL2( a6, b0, b1 ); \ + a6 = _mm512_xor_si512( a6, b6 ); \ + MUL2( a7, b0, b1 ); \ + a7 = _mm512_xor_si512( a7, b7 ); \ \ /* compute v_i : double w_i */\ /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - b5 = _mm512_xor_si512(b5, a0);\ - MUL2(a1, b0, b1);\ - b6 = _mm512_xor_si512(b6, a1);\ - MUL2(a2, b0, b1);\ - b7 = _mm512_xor_si512(b7, a2);\ - MUL2(a5, b0, b1);\ - b2 = _mm512_xor_si512(b2, a5);\ - MUL2(a6, b0, b1);\ - b3 = _mm512_xor_si512(b3, a6);\ - MUL2(a7, b0, b1);\ - b4 = _mm512_xor_si512(b4, a7);\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ + MUL2( a0, b0, b1 ); \ + b5 = _mm512_xor_si512( b5, a0 ); \ + MUL2( a1, b0, b1 ); \ + b6 = _mm512_xor_si512( b6, a1 ); \ + MUL2( a2, b0, b1 ); \ + b7 = _mm512_xor_si512( b7, a2 ); \ + MUL2( a5, b0, b1 ); \ + b2 = _mm512_xor_si512( b2, a5 ); \ + MUL2( a6, b0, b1 ); \ + b3 = _mm512_xor_si512( b3, a6 ); \ + MUL2( a7, b0, b1 ); \ + b4 = _mm512_xor_si512( b4, a7 ); \ + MUL2( a3, b0, b1 ); \ + MUL2( a4, b0, b1 ); \ b0 = TEMP0;\ b1 = TEMP1;\ - b0 = _mm512_xor_si512(b0, a3);\ - b1 = _mm512_xor_si512(b1, a4);\ + b0 = _mm512_xor_si512( b0, a3 ); \ + b1 = _mm512_xor_si512( b1, a4 ); \ }/*MixBytes*/ /* one round @@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY = * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2_2WAY(i, j, k){\ - j = _mm256_xor_si256(j, j);\ - j = _mm256_cmpgt_epi8(j, i );\ + j = _mm256_cmpgt_epi8( m256_zero, i );\ i = _mm256_add_epi8(i, i);\ - j = _mm256_and_si256(j, k);\ - i = _mm256_xor_si256(i, j);\ + i = mm256_xorand( i, j, k );\ } #define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 9fca48b..c9f558c 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, input, 640 ); groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) ); groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 ); uint32_t hash0[20] __attribute__ ((aligned (64))); @@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input ) // rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, - hash6, hash7 ); #else @@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input ) hash4, hash5, hash6, hash7, input, 640 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, 
&myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - - intrlv_8x32( vhash, hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, 512 ); + memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) ); #endif + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + sha256_8way_update( &ctx.sha, vhash, 64 ); sha256_8way_close( &ctx.sha, output ); } diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index 3c2f4d2..e09a4c2 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,17 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { -#if defined(HMAC_SPH_SHA) sph_sha256_context ctx; sph_sha256_init( &ctx ); sph_sha256( &ctx, in, len ); sph_sha256_close( &ctx, digest ); -#else - SHA256_CTX ctx; - SHA256_Init( &ctx ); - SHA256_Update( &ctx, in, len ); - SHA256_Final( digest, &ctx ); -#endif } /** @@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->ictx ); sph_sha256( &ctx->ictx, K, Klen ); sph_sha256_close( &ctx->ictx, khash ); -#else - SHA256_Init( &ctx->ictx ); - SHA256_Update( &ctx->ictx, K, Klen ); - SHA256_Final( khash, &ctx->ictx ); -#endif - K = khash; - Klen = 32; + + K = khash; + Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->ictx ); -#else - SHA256_Init( &ctx->ictx ); -#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->ictx, pad, 64 ); -#else - SHA256_Update( &ctx->ictx, pad, 64 ); -#endif /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ -#if defined(HMAC_SPH_SHA) sph_sha256_init( &ctx->octx ); -#else - SHA256_Init( &ctx->octx ); -#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->octx, pad, 64 ); -#else - SHA256_Update( &ctx->octx, pad, 64 ); -#endif } /* Add bytes to the HMAC-SHA256 operation. */ @@ -131,11 +102,7 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ -#if defined(HMAC_SPH_SHA) sph_sha256( &ctx->ictx, in, len ); -#else - SHA256_Update( &ctx->ictx, in, len ); -#endif } /* Finish an HMAC-SHA256 operation. */ @@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) { unsigned char ihash[32]; -#if defined(HMAC_SPH_SHA) sph_sha256_close( &ctx->ictx, ihash ); sph_sha256( &ctx->octx, ihash, 32 ); sph_sha256_close( &ctx->octx, digest ); -#else - /* Finish the inner SHA256 operation. */ - SHA256_Final( ihash, &ctx->ictx ); - - /* Feed the inner hash to the outer SHA256 operation. 
*/ - SHA256_Update( &ctx->octx, ihash, 32 ); - - /* Finish the outer SHA256 operation. */ - SHA256_Final( digest, &ctx->octx ); -#endif } /** diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index 41e5673..a735c53 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -29,24 +29,14 @@ #ifndef HMAC_SHA256_H__ #define HMAC_SHA256_H__ -//#define HMAC_SSL_SHA 1 -#define HMAC_SPH_SHA 1 - #include #include #include "sph_sha2.h" -#include - typedef struct HMAC_SHA256Context { -#if defined(HMAC_SPH_SHA) sph_sha256_context ictx; sph_sha256_context octx; -#else - SHA256_CTX ictx; - SHA256_CTX octx; -#endif } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 5e70c3e..7e39954 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -41,7 +41,7 @@ #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) //#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) -#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) ) +#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) ) #define ROTR SPH_ROTR32 #define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) @@ -319,6 +319,7 @@ static const sph_u32 K[64] = { t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + K[pcount + (pc)] + W[(pc) & 0x0F]); \ t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + Y_xor_Z = X_xor_Y; \ d = SPH_T32(d + t1); \ h = SPH_T32(t1 + t2); \ } while (0) @@ -329,7 +330,7 @@ static const sph_u32 K[64] = { SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \ sph_u32 W[16]; \ unsigned pcount; \ \ @@ -342,6 +343,7 @@ static const sph_u32 K[64] = { G = (r)[6]; \ H = (r)[7]; \ pcount = 0; \ + Y_xor_Z = B ^ C; \ SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ @@ -389,7 +391,7 @@ static const sph_u32 K[64] = { #else // large footprint (default) #define SHA2_ROUND_BODY(in, r) do { \ - sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \ sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ \ @@ -401,388 +403,453 @@ static const sph_u32 K[64] = { F = (r)[5]; \ G = (r)[6]; \ H = (r)[7]; \ + Y_xor_Z = B ^ C; \ W00 = in(0); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x428A2F98) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = in(1); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x71374491) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = in(2); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB5C0FBCF) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = in(3); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xE9B5DBA5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = in(4); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x3956C25B) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = in(5); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x59F111F1) + W05); \ T2 = SPH_T32(BSG2_0(D) + 
MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = in(6); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x923F82A4) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = in(7); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xAB1C5ED5) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = in(8); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xD807AA98) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = in(9); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x12835B01) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = in(10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x243185BE) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = in(11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x550C7DC3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = in(12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x72BE5D74) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = in(13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x80DEB1FE) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = in(14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x9BDC06A7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = in(15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC19BF174) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xE49B69C1) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xEFBE4786) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x0FC19DC6) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x240CA1CC) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x2DE92C6F) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4A7484AA) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = 
SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5CB0A9DC) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x76F988DA) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x983E5152) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA831C66D) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xB00327C8) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xBF597FC7) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xC6E00BF3) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD5A79147) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x06CA6351) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x14292967) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x27B70A85) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x2E1B2138) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x4D2C6DFC) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x53380D13) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x650A7354) + W04); \ 
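/* Illustrative scalar sketch, not part of the patch: the rewritten MAJ macro
   computes Y ^ ( (X ^ Y) & (Y ^ Z) ), and because the SHA-256 argument
   rotation turns one round's (X,Y) into the next round's (Y,Z), the X_xor_Y
   produced in each step is exactly the Y_xor_Z the following step needs.
   That is why every unrolled round above is followed by "Y_xor_Z = X_xor_Y;"
   and each MAJ costs only one fresh XOR plus an AND.  All helper names below
   are invented for the demonstration. */

#include <assert.h>
#include <stdint.h>

static uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
{
   return ( x & y ) | ( x & z ) | ( y & z );
}

static void maj_rotation_demo( const uint32_t v[8] )
{
   uint32_t a = v[0], b = v[1], c = v[2];
   uint32_t Y_xor_Z = b ^ c;              /* seeded once, as in the macro body */
   for ( int r = 3; r < 8; r++ )
   {
      uint32_t X_xor_Y = a ^ b;
      assert( ( b ^ ( X_xor_Y & Y_xor_Z ) ) == maj_ref( a, b, c ) );
      Y_xor_Z = X_xor_Y;                  /* becomes the next round's b ^ c */
      c = b; b = a; a = v[r];             /* stand-in for the SHA-256 rotation */
   }
}

int main(void)
{
   const uint32_t v[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                           0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
   maj_rotation_demo( v );
   return 0;
}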
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x766A0ABB) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x81C2C92E) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x92722C85) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0xA2BFE8A1) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0xA81A664B) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0xC24B8B70) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0xC76C51A3) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0xD192E819) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xD6990624) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xF40E3585) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x106AA070) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x19A4C116) + W00); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x1E376C08) + W01); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x2748774C) + W02); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W03 = SPH_T32(SSG2_1(W01) + W12 + 
SSG2_0(W04) + W03); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x34B0BCB5) + W03); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x391C0CB3) + W04); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0x4ED8AA4A) + W05); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0x5B9CCA4F) + W06); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0x682E6FF3) + W07); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + SPH_C32(0x748F82EE) + W08); \ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + Y_xor_Z = X_xor_Y; \ D = SPH_T32(D + T1); \ H = SPH_T32(T1 + T2); \ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + SPH_C32(0x78A5636F) + W09); \ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + Y_xor_Z = X_xor_Y; \ C = SPH_T32(C + T1); \ G = SPH_T32(T1 + T2); \ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + SPH_C32(0x84C87814) + W10); \ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + Y_xor_Z = X_xor_Y; \ B = SPH_T32(B + T1); \ F = SPH_T32(T1 + T2); \ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + SPH_C32(0x8CC70208) + W11); \ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + Y_xor_Z = X_xor_Y; \ A = SPH_T32(A + T1); \ E = SPH_T32(T1 + T2); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + SPH_C32(0x90BEFFFA) + W12); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + Y_xor_Z = X_xor_Y; \ H = SPH_T32(H + T1); \ D = SPH_T32(T1 + T2); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + SPH_C32(0xA4506CEB) + W13); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + Y_xor_Z = X_xor_Y; \ G = SPH_T32(G + T1); \ C = SPH_T32(T1 + T2); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + SPH_C32(0xBEF9A3F7) + W14); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + Y_xor_Z = X_xor_Y; \ F = SPH_T32(F + T1); \ B = SPH_T32(T1 + T2); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + SPH_C32(0xC67178F2) + W15); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + Y_xor_Z = X_xor_Y; \ E = SPH_T32(E + T1); \ A = SPH_T32(T1 + T2); \ (r)[0] = SPH_T32((r)[0] + A); \ diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2b0b735..2c93df9 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) register __m512i K0, K1, K2, K3, K4, K5, K6, K7; __m512i *M = (__m512i*)msg; __m512i *H = (__m512i*)ctx->h; + const __m512i count = 
_mm512_set4_epi32( ctx->count3, ctx->count2, + ctx->count1, ctx->count0 ); int r; P0 = H[0]; @@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) - K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( - ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); + K0 = _mm512_xor_si512( K0, + _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = _mm512_xor_si512( K0, mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( - ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); + K1 = _mm512_xor_si512( K1, mm512_ror128_32( + _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, @@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) - K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( - ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); + K7 = _mm512_xor_si512( K7, mm512_swap128_64( + _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e047d77..a593cf5 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round -// working proof of concept -/* - __m512i K = m512_const1_128( m[0] ); - __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K ); - X = _mm512_aesenc_epi128( X, m512_zero ); - k00 = _mm512_castsi512_si128( K ); - x = _mm512_castsi512_si128( X ); -*/ - k00 = m[0]; x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index f2652f3..856a07f 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; - -// static const m512_v16 code[] = { c1_16(185), c1_16(233), -// c1_16(185), c1_16(233) }; - - S0l = _mm512_xor_si512( S[0], M[0] ); S0h = _mm512_xor_si512( S[1], M[1] ); S1l = _mm512_xor_si512( S[2], M[2] ); @@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) // targetted, local macros don't need a unique name #define S(i) S##i +#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca ) +#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 ) + +/* #define F_0(B, C, D) \ _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) #define F_1(B, C, D) \ _mm512_or_si512( _mm512_and_si512( D, C ),\ _mm512_and_si512( _mm512_or_si512( D,C ), B ) ) +*/ #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) diff --git a/algo/simd/vector.c b/algo/simd/vector.c index 12692db..60f0cc7 100644 --- a/algo/simd/vector.c +++ b/algo/simd/vector.c @@ -6,10 +6,6 @@ #define PRINT_SOME 0 -/* JDD all ocurrances of macro X in this file renamed to XX - * due to name conflict - */ - int SupportedLength(int hashbitlen) { if (hashbitlen <= 0 || hashbitlen > 512) return 0; diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 
e21e4f1..fd16c24 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B, #define INTEGERIFY (uint32_t)X.d[0] #endif +// AVX512 ternary logic optimization +#if defined(__AVX512VL__) + +#define XOR_X_XOR_X( in1, in2 ) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 ); + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ + X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ + X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ + X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \ + SALSA20(out) + +#else + +#define XOR_X_XOR_X( in1, in2 ) \ + XOR_X( in1 ) \ + XOR_X( in2 ) + +#define XOR_X_2_XOR_X( in1, in2, in3 ) \ + XOR_X_2( in1, in2 ) \ + XOR_X( in3 ) + +#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \ + XOR_X(in1) \ + XOR_X(in2) \ + SALSA20( out ) + +#endif + /** * Apply the Salsa20 core to the block provided in X ^ in. */ @@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, { DECL_X - XOR_X_2(Bin1[1], Bin2[1]) - XOR_X(Bin1[0]) + XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] ) +// XOR_X_2(Bin1[1], Bin2[1]) +// XOR_X(Bin1[0]) SALSA20_XOR_MEM(Bin2[0], Bout[0]) - XOR_X(Bin1[1]) - SALSA20_XOR_MEM(Bin2[1], Bout[1]) + +// Factor out the XOR from salsa20 to do a xor3 + XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] ) +// XOR_X(Bin1[1]) +// SALSA20_XOR_MEM(Bin2[1], Bout[1]) return INTEGERIFY; } @@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, i = 0; r--; do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) + XOR_X_XOR_X( Bin1[i], Bin2[i] ) +// XOR_X(Bin1[i]) +// XOR_X(Bin2[i]) PWXFORM WRITE_X(Bout[i]) - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) + XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] ) +// XOR_X(Bin1[i + 1]) +// XOR_X(Bin2[i + 1]) PWXFORM if (unlikely(i >= r)) diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index c5b6d78..260322a 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -35,7 +35,6 @@ #include "miner.h" #include "simd-utils.h" #include "algo/sha/sph_sha2.h" -#include #ifdef __cplusplus extern "C" { diff --git a/build-allarch.sh b/build-allarch.sh index fa1d866..c4d9ffd 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl make -j 8 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-sse42.exe diff --git a/configure b/configure index 8382a1b..7430186 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.0. 
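The AVX512VL macros above, like the mm512_xor3/mm512_xorand and mm128_xor3/mm128_xorand helpers elsewhere in this patch, all compile down to vpternlog with an 8-bit truth-table immediate. As a rough self-contained illustration (not part of the patch), the immediates can be derived by enumerating the three input bits, which is where 0x96 and 0x78 come from:

#include <stdio.h>

/* For _mm_ternarylogic_epi32( a, b, c, imm ), each result bit is
   ( imm >> ((a_bit << 2) | (b_bit << 1) | c_bit) ) & 1, so the immediate is
   simply the truth table of the desired three-input function. */
int main(void)
{
   unsigned xor3 = 0, xorand = 0;

   for ( int a = 0; a <= 1; a++ )
      for ( int b = 0; b <= 1; b++ )
         for ( int c = 0; c <= 1; c++ )
         {
            int idx = ( a << 2 ) | ( b << 1 ) | c;
            if ( a ^ b ^ c )     xor3   |= 1u << idx;    /* a ^ b ^ c     */
            if ( a ^ ( b & c ) ) xorand |= 1u << idx;    /* a ^ ( b & c ) */
         }

   printf( "xor3   immediate: 0x%02x\n", xor3 );    /* prints 0x96 */
   printf( "xorand immediate: 0x%02x\n", xorand );  /* prints 0x78 */
   return 0;
}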
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.17.0' -PACKAGE_STRING='cpuminer-opt 3.17.0' +PACKAGE_VERSION='3.17.1' +PACKAGE_STRING='cpuminer-opt 3.17.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.17.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.17.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.17.0 +cpuminer-opt configure 3.17.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.17.0, which was +It was created by cpuminer-opt $as_me 3.17.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.17.0' + VERSION='3.17.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.17.0, which was +This file was extended by cpuminer-opt $as_me 3.17.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.17.0 +cpuminer-opt config.status 3.17.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index f5612ef..332d1e6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.17.0]) +AC_INIT([cpuminer-opt], [3.17.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 9b72376..e9c01fe 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1054,6 +1054,8 @@ void report_summary_log( bool force ) applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); if ( curr_temp > hi_temp ) hi_temp = curr_temp; + if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) ) + restart_threads(); prev_temp = curr_temp; } } @@ -2856,7 +2858,6 @@ static bool cpu_capability( bool display_only ) bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool use_aes; bool use_sse2; - bool use_sse42; bool use_avx2; bool use_avx512; bool use_sha; @@ -2930,13 +2931,14 @@ static bool cpu_capability( bool display_only ) if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); + if ( algo_has_avx512 ) printf( " AVX512" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); + else if ( algo_has_sse42 ) printf( " SSE4.2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes || + algo_has_vaes256 ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sha ) printf( " SHA" ); } printf("\n"); @@ -2972,13 +2974,12 @@ static bool cpu_capability( bool display_only ) // Determine mining options use_sse2 = cpu_has_sse2 && algo_has_sse2; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes - && ( use_avx512 || algo_has_vaes256 ); - use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || + use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes + || algo_has_vaes256 ); + use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 || use_sha || use_vaes ); // Display best options @@ -2988,7 +2989,6 @@ static bool cpu_capability( bool display_only ) { if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_sse42 ) printf( " SSE4.2" ); else if ( use_sse2 ) printf( " SSE2" ); if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index e166b14..90066f0 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n ) static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i 
= 0; i < n; i ++ ) dst[i] = src[i]; } +#if defined(__AVX512VL__) + +// a ^ b ^ c +#define mm128_xor3( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x96 ) + +// a ^ ( b & c ) +#define mm128_xorand( a, b, c ) \ + _mm_ternarylogic_epi64( a, b, c, 0x78 ) + +#else + +#define mm128_xor3( a, b, c ) \ + _mm_xor_si128( a, _mm_xor_si128( b, c ) ) + +#define mm128_xorand( a, b, c ) \ + _mm_xor_si128( a, _mm_and_si128( b, c ) ) + +#endif // // Bit rotations diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 1b9fca8..125e2c8 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -275,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. -// -// Swap 128 bit elements in 256 bit vector. -#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -// Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#if defined(__AVX512VL__) -#if defined(__AVX512F__) && defined(__AVX512VL__) +static inline __m256i mm256_swap_128( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 2 ); } + +static inline __m256i mm256_ror_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 1 ); } + +static inline __m256i mm256_rol_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 3 ); } static inline __m256i mm256_ror_1x32( const __m256i v ) { return _mm256_alignr_epi32( v, v, 1 ); } @@ -293,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v ) #else // AVX2 +// Swap 128 bit elements in 256 bit vector. +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) + +// Rotate 256 bit vector by one 64 bit element +#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) +#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) + // Rotate 256 bit vector by one 32 bit element. #define mm256_ror_1x32( v ) \ _mm256_permutevar8x32_epi32( v, \ @@ -304,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) + #endif // AVX512 else AVX2 // diff --git a/winbuild-cross.sh b/winbuild-cross.sh index f6402ba..4953cec 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe
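The extra -maes added to the Westmere CFLAGS in build-allarch.sh and winbuild-cross.sh matters because the AES-NI code paths are normally compiled only when the compiler defines __AES__; a toolchain whose -march=westmere does not set that macro would otherwise drop them silently. A minimal sketch of that kind of compile-time guard (illustrative, not taken from the tree):

#include <stdio.h>

#if defined(__AES__) && defined(__SSE4_2__)
  #define HAVE_AESNI_PATH 1
#else
  #define HAVE_AESNI_PATH 0
#endif

int main(void)
{
   /* With "-march=westmere -maes" (or "-maes -msse4.2") this prints the
      first message; without __AES__ the AES-NI code would never be built. */
   puts( HAVE_AESNI_PATH ? "AES-NI (Westmere) code path enabled"
                         : "AES-NI code path disabled at compile time" );
   return 0;
}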