Author: Jay D Dee
Date:   2024-05-20 23:08:50 -04:00
parent 4f930574cc
commit 042d13d1e1
129 changed files with 835 additions and 538 deletions

View File

@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
// HMAC 16-way AVX512
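Throughout this commit the four-macro AVX512 feature test collapses into the single SIMD512 symbol. A minimal sketch of the presumed central definition (the defining header is not part of this diff, so the exact form is an assumption):

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SIMD512 1   // full 512-bit SIMD baseline: AVX512 F, VL, DQ, BW
#endif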

View File

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
#endif // AVX2
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
typedef struct _hmac_sha256_16way_context
{

View File

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
-#if defined(__AVX512VL__)
+#if defined(VL256)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
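For reference, 0xca is the SHA-2 choice function's truth table packed into the ternary-logic immediate; a scalar sketch of what the single-instruction CHx computes:

// CH(x,y,z): take y where x is 1, take z where x is 0.
// Truth table over inputs (x,y,z) = (bit2,bit1,bit0), read from 111 down to 000:
//   1 1 0 0 1 0 1 0  =  0xca
static inline uint32_t ch_scalar( uint32_t x, uint32_t y, uint32_t z )
{  return ( x & y ) ^ ( ~x & z );  }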
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
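The X_xor_Y / Y_xor_Z pair implements the standard MAJ strength reduction: MAJ(x,y,z) = (x & y) ^ (x & z) ^ (y & z) = y ^ ((x ^ y) & (y ^ z)). Because the working registers rotate every round, next round's y ^ z is exactly this round's x ^ y, so the value is computed once and handed down, saving one XOR per round; a scalar sketch:

static inline uint32_t maj_scalar( uint32_t x, uint32_t y, uint32_t y_xor_z )
{  return y ^ ( ( x ^ y ) & y_xor_z );  }  // caller carries x^y forward as next round's y_xor_z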
@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
// SHA-256 16 way

View File

@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
#if defined(__x86_64__) && defined(__SHA__)
+/* common code used for rounds 12 through 51 */
+#define sha256_generic_qround( s0, s1, m, a, b, c ) \
+TMP = _mm_alignr_epi8( a, c, 4 ); \
+s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
+b = _mm_add_epi32( b, TMP ); \
+b = _mm_sha256msg2_epu32( b, a ); \
+m = _mm_shuffle_epi32( m, 0x0e ); \
+s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
+c = _mm_sha256msg1_epu32( c, a );
+// r12-15
+// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
+// r16-19
+// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
+// r20-23
+// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
+// r24-27
+// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \
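Each sha256_generic_qround invocation advances the hash four rounds while extending the message schedule, and the t0..t3 arguments rotate so the pattern repeats every four calls. Expanding the r12-15 call by hand (register names as in the comments above):

// sha256_generic_qround( s0, s1, m, t3, t0, t2 ) becomes:
TMP = _mm_alignr_epi8( t3, t2, 4 );        // sliding 4-lane window of the schedule
s1 = _mm_sha256rnds2_epu32( s1, s0, m );   // rounds 12-13, low half of m
t0 = _mm_add_epi32( t0, TMP );
t0 = _mm_sha256msg2_epu32( t0, t3 );       // finish the next message quad
m = _mm_shuffle_epi32( m, 0x0e );          // move the high half of m down
s0 = _mm_sha256rnds2_epu32( s0, s1, m );   // rounds 14-15
t2 = _mm_sha256msg1_epu32( t2, t3 );       // start a later message quad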
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
-uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
+uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
-ABEF_SAVE = STATE0; \
-CDGH_SAVE = STATE1; \
+ABCD_SAVE = STATE0; \
+EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
-STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
-STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
+STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
+STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}
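The renames track the Arm semantics: vsha256hq_u32 and vsha256h2q_u32 keep the working state as plain {a,b,c,d} and {e,f,g,h} vectors rather than the interleaved ABEF/CDGH halves used by the x86 SHA extensions, so ABCD_SAVE and EFGH_SAVE now name what the registers actually hold.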
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
-uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
-uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
+uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
+uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
-ABEF_SAVE_X = STATE0_X; \
-ABEF_SAVE_Y = STATE0_Y; \
-CDGH_SAVE_X = STATE1_X; \
-CDGH_SAVE_Y = STATE1_Y; \
+ABCD_SAVE_X = STATE0_X; \
+ABCD_SAVE_Y = STATE0_Y; \
+EFGH_SAVE_X = STATE1_X; \
+EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
-STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
-STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
-STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
-STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
+STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
+STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
+STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
+STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \
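Interleaving two independent blocks (the _X/_Y pairs) lets one lane's fixed-latency vsha256 instructions execute while the other's are still in flight. A call sketch with the argument order taken from the macro parameters above:

// Two independent 64-byte blocks hashed in lock-step:
sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, input_Y,
                          state_in_X, state_in_Y );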

View File

@@ -113,7 +113,7 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#endif
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
// SHA-256 16 way x86_64

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
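Flags like these normally select the scanhash implementation when the algorithm's gate is registered; a hedged sketch patterned on the algo-gate-api convention (the function and symbol names below are assumptions, not part of this diff):

bool register_sha256d_algo( algo_gate_t *gate )
{
#if defined(SHA256D_16WAY)
   gate->scanhash = (void*)&scanhash_sha256d_16way;   // SIMD512 path
#elif defined(SHA256D_SHA)
   gate->scanhash = (void*)&scanhash_sha256d_sha;     // SHA-NI path
#else
   gate->scanhash = (void*)&scanhash_sha256d;         // generic fallback
#endif
   return true;
}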

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

View File

@@ -6,7 +6,7 @@
#include "sha256-hash.h"
#include "sph_sha2.h"
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define SHA256DT_16X32 1
#elif defined(__x86_64__) && defined(__SHA__)
#define SHA256DT_X86_SHA256 1

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1

View File

@@ -73,29 +73,10 @@ static const uint64_t K512[80] =
// Experimental. Not tested. Not reviewed. Compile tested only.
-// Needs GCC-13 for compilation.
-// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
+// Needs GCC-14 for compilation.
+// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
// Modelled after noloader sha256 implementation.
-// It's not clear how SHA512 will be supported before AVX10 considering how
-// dependant it is on _mm256_alignr_epi64 which is only available with AVX512VL
-// until AVX10-256.
-#if defined(__AVX512VL__)
-#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
-#else
-// Ugly workaround to make it work with AVX2
-static const __m256i mask __attribute__ ((aligned (32)))
-= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };
-#define mm256_alignr_1x64( v1, v0 ) \
-_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
-_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) );
-#endif
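The fixed one-lane helper and its AVX2 workaround give way to a generic mm256_alignr64( hi, lo, n ). A presumed definition (the real one lives elsewhere in simd-utils and is not shown in this diff):

#if defined(VL256)
// Concatenate hi:lo as 8 qwords, shift right n qwords, keep the low 4.
#define mm256_alignr64( hi, lo, n )  _mm256_alignr_epi64( hi, lo, n )
#endif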
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -109,7 +90,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
-0x0001020304050607 ) )
+0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
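The casts added throughout the rounds below follow from the intrinsic signatures: the rnds2 and msg1 primitives take a 128-bit operand while msg2 stays full width.

// immintrin.h prototypes for the SHA512 ISA extension (GCC-14 and later):
__m256i _mm256_sha512rnds2_epi64( __m256i state1, __m256i state0, __m128i k );
__m256i _mm256_sha512msg1_epi64( __m256i a, __m128i b );
__m256i _mm256_sha512msg2_epi64( __m256i a, __m256i b );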
@@ -123,153 +104,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128 (MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
-TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
-STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
+_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
+_mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
@@ -289,7 +350,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
-__m256i MSG, TMP, BSWAP64;
+__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
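Unlike the big-endian transform above, the little-endian variant consumes input as-is, so the BSWAP64 shuffle constant drops out of the declarations; the round structure below is otherwise identical.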
@@ -308,141 +369,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
-TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
-TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
-STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
-TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
-STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
-STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
+TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
+TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
+STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
+STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
@@ -462,7 +572,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
// SHA-512 8 way 64 bit
@@ -664,8 +774,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
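A presumed central definition of VL256 (an assumption, as the defining header is not in this diff): it marks 256-bit code that needs AVX512VL-class instructions today and is intended to map onto AVX10-256 later.

#if defined(__AVX512VL__)
#define VL256 1   // 256-bit vectors with AVX512VL features; later AVX10-256
#endif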
@@ -717,7 +826,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
int i;
register __m256i A, B, C, D, E, F, G, H;
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
@@ -754,7 +863,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 );
}
-#if !defined(__AVX512VL__)
+#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif

View File

@@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
// SHA-512 8 way

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdint.h>
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1