mirror of https://github.com/JayDDee/cpuminer-opt.git
v3.9.5.3
@@ -412,34 +412,16 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm128_bswap_32( *(buf + 0) ); \
M[0x1] = mm128_bswap_32( *(buf + 1) ); \
M[0x2] = mm128_bswap_32( *(buf + 2) ); \
M[0x3] = mm128_bswap_32( *(buf + 3) ); \
M[0x4] = mm128_bswap_32( *(buf + 4) ); \
M[0x5] = mm128_bswap_32( *(buf + 5) ); \
M[0x6] = mm128_bswap_32( *(buf + 6) ); \
M[0x7] = mm128_bswap_32( *(buf + 7) ); \
M[0x8] = mm128_bswap_32( *(buf + 8) ); \
M[0x9] = mm128_bswap_32( *(buf + 9) ); \
M[0xA] = mm128_bswap_32( *(buf + 10) ); \
M[0xB] = mm128_bswap_32( *(buf + 11) ); \
M[0xC] = mm128_bswap_32( *(buf + 12) ); \
M[0xD] = mm128_bswap_32( *(buf + 13) ); \
M[0xE] = mm128_bswap_32( *(buf + 14) ); \
M[0xF] = mm128_bswap_32( *(buf + 15) ); \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
mm128_block_bswap_32( M, buf ); \
mm128_block_bswap_32( M+8, buf+8 ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
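Note: the two initialisations above are interchangeable: _mm_set1_epi32( x ) broadcasts one 32-bit value into all four lanes, which is exactly what the longer _mm_set_epi32( x, x, x, x ) form spells out by hand. A minimal stand-alone check (this harness is illustrative, not from the repository):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* CS0 in BLAKE-256 is 0x243F6A88; broadcast it both ways. */
    __m128i a = _mm_set_epi32( 0x243F6A88, 0x243F6A88, 0x243F6A88, 0x243F6A88 );
    __m128i b = _mm_set1_epi32( 0x243F6A88 );  /* one argument, same four lanes */
    printf( "%s\n", memcmp( &a, &b, 16 ) == 0 ? "identical" : "different" );
    return 0;
}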
// current impl

#if defined(__SSSE3__)

#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
} while(0)

#else // SSE2

#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
M2 = mm128_bswap_32( buf[2] ); \
M3 = mm128_bswap_32( buf[3] ); \
M4 = mm128_bswap_32( buf[4] ); \
M5 = mm128_bswap_32( buf[5] ); \
M6 = mm128_bswap_32( buf[6] ); \
M7 = mm128_bswap_32( buf[7] ); \
M8 = mm128_bswap_32( buf[8] ); \
M9 = mm128_bswap_32( buf[9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
} while(0)

#endif // SSSE3 else SSE2
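Note: the SSSE3 variant above exists because _mm_shuffle_epi8 byte-swaps all four 32-bit lanes in one instruction, while plain SSE2 needs a shift-and-or sequence. A self-contained sketch of both paths; bswap32_sse2 is a plausible reconstruction of what a helper like mm128_bswap_32 must do on SSE2, not code quoted from the repository:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* SSSE3: one pshufb with the same index constant the macro above uses.
   Build with -mssse3. */
static __m128i bswap32_ssse3( __m128i v )
{
    return _mm_shuffle_epi8( v,
            _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
}

/* SSE2 fallback: swap bytes within each 16-bit half, then swap the halves. */
static __m128i bswap32_sse2( __m128i v )
{
    v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
    return _mm_or_si128( _mm_slli_epi32( v, 16 ), _mm_srli_epi32( v, 16 ) );
}

int main(void)
{
    __m128i x = _mm_set_epi32( 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 );
    uint32_t a[4], b[4];
    _mm_storeu_si128( (__m128i*)a, bswap32_ssse3( x ) );
    _mm_storeu_si128( (__m128i*)b, bswap32_sse2( x ) );
    for ( int i = 0; i < 4; i++ )
        printf( "%08x %08x\n", a[i], b[i] );  /* the two columns agree lane by lane */
    return 0;
}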
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -486,22 +516,7 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm128_bswap_32( buf[ 0] ); \
M1 = mm128_bswap_32( buf[ 1] ); \
M2 = mm128_bswap_32( buf[ 2] ); \
M3 = mm128_bswap_32( buf[ 3] ); \
M4 = mm128_bswap_32( buf[ 4] ); \
M5 = mm128_bswap_32( buf[ 5] ); \
M6 = mm128_bswap_32( buf[ 6] ); \
M7 = mm128_bswap_32( buf[ 7] ); \
M8 = mm128_bswap_32( buf[ 8] ); \
M9 = mm128_bswap_32( buf[ 9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -519,14 +534,14 @@ do { \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
H0 = mm128_xor4( V8, V0, S0, H0 ); \
H1 = mm128_xor4( V9, V1, S1, H1 ); \
H2 = mm128_xor4( VA, V2, S2, H2 ); \
H3 = mm128_xor4( VB, V3, S3, H3 ); \
H4 = mm128_xor4( VC, V4, S0, H4 ); \
H5 = mm128_xor4( VD, V5, S1, H5 ); \
H6 = mm128_xor4( VE, V6, S2, H6 ); \
H7 = mm128_xor4( VF, V7, S3, H7 ); \
} while (0)

#endif
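Note: mm128_xor4 (and the 256-bit mm256_xor4 below) folds a four-input XOR into one helper. XOR is associative, so pairing the terms as (a^b)^(c^d) cuts the dependency chain from three sequential XORs to two. A plausible definition, assumed rather than quoted from the repository's headers:

#include <immintrin.h>

/* Hypothetical stand-in for the repo's mm128_xor4: XOR of four vectors,
   paired so the two inner XORs can issue in parallel. */
static inline __m128i mm128_xor4_sketch( __m128i a, __m128i b,
                                         __m128i c, __m128i d )
{
    return _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) );
}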
@@ -607,6 +622,7 @@ do { \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap32; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -623,22 +639,24 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -656,22 +674,14 @@ do { \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
S2 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
S3 ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
S0 ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
S1 ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
S2 ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
S3 ), H7 ); \
H0 = mm256_xor4( V8, V0, S0, H0 ); \
H1 = mm256_xor4( V9, V1, S1, H1 ); \
H2 = mm256_xor4( VA, V2, S2, H2 ); \
H3 = mm256_xor4( VB, V3, S3, H3 ); \
H4 = mm256_xor4( VC, V4, S0, H4 ); \
H5 = mm256_xor4( VD, V5, S1, H5 ); \
H6 = mm256_xor4( VE, V6, S2, H6 ); \
H7 = mm256_xor4( VF, V7, S3, H7 ); \
} while (0)
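Note: _mm256_shuffle_epi8 shuffles each 128-bit lane independently, which is why shuf_bswap32 above repeats the same 16-byte pattern in both halves. The block helpers this commit substitutes elsewhere (mm128_block_bswap_32 / mm256_block_bswap_32) presumably apply one such shuffle across eight vectors per call; a sketch under that assumption (the helper's real definition is not quoted here):

#include <immintrin.h>

/* Assumed shape of mm256_block_bswap_32: byte-swap the 32-bit words of
   eight __m256i vectors (half an interleaved 64-byte block per lane)
   in one call. */
static inline void mm256_block_bswap_32_sketch( __m256i *d, const __m256i *s )
{
    const __m256i m = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203,
                                         0x0c0d0e0f08090a0b, 0x0405060700010203 );
    for ( int i = 0; i < 8; i++ )
        d[i] = _mm256_shuffle_epi8( s[i], m );
}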
@@ -685,6 +695,7 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
__m128i zero = m128_zero;
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
@@ -694,16 +705,10 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );

casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
casti_m128i( ctx->S, 0 ) = zero;
casti_m128i( ctx->S, 1 ) = zero;
casti_m128i( ctx->S, 2 ) = zero;
casti_m128i( ctx->S, 3 ) = zero;
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -796,14 +801,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
blake32_4way( ctx, buf, 64 );
}

casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
}

#if defined (__AVX2__)
@@ -816,11 +814,21 @@ static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi32( salt[i] );
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );

casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -872,14 +880,10 @@ static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
// union {
__m256i buf[16];
// sph_u32 dummy;
// } u;
size_t ptr, k;
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
__m256i *out;

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -923,9 +927,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_bswap_32( sc->H[k] );
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}

#endif
@@ -412,18 +412,18 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
@@ -464,80 +464,76 @@ static const sph_u64 CB[16] = {

//current impl

#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#define COMPRESS64_4WAY do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB7 ) ); \
shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = mm256_xor4( V8, V0, S0, H0 ); \
H1 = mm256_xor4( V9, V1, S1, H1 ); \
H2 = mm256_xor4( VA, V2, S2, H2 ); \
H3 = mm256_xor4( VB, V3, S3, H3 ); \
H4 = mm256_xor4( VC, V4, S0, H4 ); \
H5 = mm256_xor4( VD, V5, S1, H5 ); \
H6 = mm256_xor4( VE, V6, S2, H6 ); \
H7 = mm256_xor4( VF, V7, S3, H7 ); \
} while (0)

#endif
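Note: the 64-bit swap works the same way as the 32-bit one; only the pshufb index pattern changes, reversing eight bytes per 64-bit lane. A quick stand-alone check against the compiler's scalar byte swap (harness written for illustration, build with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const __m256i m = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607,
                                         0x08090a0b0c0d0e0f, 0x0001020304050607 );
    __m256i v = _mm256_set1_epi64x( 0x0123456789abcdefULL );
    uint64_t out[4];
    _mm256_storeu_si256( (__m256i*)out, _mm256_shuffle_epi8( v, m ) );
    /* every lane should equal __builtin_bswap64( 0x0123456789abcdef ) */
    printf( "%016llx vs %016llx\n",
            (unsigned long long)out[0],
            (unsigned long long)__builtin_bswap64( 0x0123456789abcdefULL ) );
    return 0;
}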
@@ -547,13 +543,23 @@ static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );

casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}

static void
@@ -604,15 +610,11 @@ static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
__m256i buf[16];
size_t ptr;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -665,9 +667,7 @@ blake64_4way_close( blake_4way_big_context *sc,

blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}

void
@@ -113,50 +113,27 @@ static const uint32_t IV256[] = {


#define expand1s( qt, M, H, i ) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-16 ] ), \
ss2( qt[ (i)-15 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-14 ] ), \
ss0( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-12 ] ), \
ss2( qt[ (i)-11 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-10 ] ), \
ss0( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
ss2( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
ss0( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
ss2( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
ss0( qt[ (i)- 1 ] ) ) ) ) ), \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )

#define expand2s( qt, M, H, i) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
_mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
_mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
ss5( qt[ (i)- 1 ] ) ) ) ) ), \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
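Note: mm128_add4_32 (and the 64-bit mm256_add4_64 used later in this commit) is the addition analogue of the xor4 helpers: four terms summed as a balanced tree, so the sixteen-term BMW expansion above becomes four independent four-term sums plus a final reduction. A plausible definition, assumed to match the usage above rather than quoted from the repository:

#include <immintrin.h>

/* Hypothetical stand-in for mm128_add4_32: sum of four vectors of 32-bit
   lanes, paired to keep the dependency chain at two adds deep. */
static inline __m128i mm128_add4_32_sketch( __m128i a, __m128i b,
                                            __m128i c, __m128i d )
{
    return _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) );
}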
#define Ws0 \
@@ -357,17 +334,11 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );

xl = _mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
_mm_xor_si128( qt[18], qt[19] ) ),
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
_mm_xor_si128( qt[22], qt[23] ) ) );
xh = _mm_xor_si128( xl,
_mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
_mm_xor_si128( qt[26], qt[27] ) ),
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
_mm_xor_si128( qt[30], qt[31] ) )));
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
@@ -695,22 +666,15 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

#define expand2s8( qt, M, H, i) \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ) ), \
_mm256_add_epi32( qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ) ), \
_mm256_add_epi32( qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi32( qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi32( s8s4( qt[ (i)- 2 ] ), \
s8s5( qt[ (i)- 1 ] ) ) ) ) ), \
mm256_add4_32( \
mm256_add4_32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ), \
mm256_add4_32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ), \
mm256_add4_32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ), \
mm256_add4_32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ), \
s8s4( qt[ (i)- 2 ] ), s8s5( qt[ (i)- 1 ] ) ) ), \
add_elt_s8( M, H, (i)-16 ) )
@@ -913,16 +877,11 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
qt[31] = expand2s8( qt, M, H, 31 );

xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],

@@ -569,28 +569,20 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )


#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rol_64( (x), 4), \
mm256_rol_64( (x), 37) ) )
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
mm256_rol_64( (x), 4), mm256_rol_64( (x),37) )

#define sb1(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rol_64( (x), 13), \
mm256_rol_64( (x), 43) ) )
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 2), \
mm256_rol_64( (x),13), mm256_rol_64( (x),43) )

#define sb2(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 1) ), \
_mm256_xor_si256( mm256_rol_64( (x), 19), \
mm256_rol_64( (x), 53) ) )
mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 1), \
mm256_rol_64( (x),19), mm256_rol_64( (x),53) )

#define sb3(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rol_64( (x), 28), \
mm256_rol_64( (x), 59) ) )
mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 2), \
mm256_rol_64( (x),28), mm256_rol_64( (x),59) )

#define sb4(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
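Note: sb0 above is the vectorised form of BMW-512's scalar s0 function; a scalar cross-check (plain C, written here for illustration) makes the four XOR terms easy to audit:

#include <stdint.h>

static inline uint64_t rotl64( uint64_t x, int c )
{
    return ( x << c ) | ( x >> ( 64 - c ) );
}

/* Scalar equivalent of the vectorised sb0(): s0(x) for BMW-512. */
static inline uint64_t bmw512_s0( uint64_t x )
{
    return ( x >> 1 ) ^ ( x << 3 ) ^ rotl64( x, 4 ) ^ rotl64( x, 37 );
}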
@@ -618,55 +610,32 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )


#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
sb2( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
sb0( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
sb2( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
sb0( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
sb2( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
sb0( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
sb2( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
_mm256_add_epi64( mm256_add4_64( \
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
sb3( qt[ (i)-10 ] ), sb0( qt[ (i)- 9 ] )), \
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )

#define expand2b( qt, M, H, i) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
_mm256_add_epi64( mm256_add4_64( \
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ), \
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )


#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
@@ -864,95 +833,90 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );

xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
xl = _mm256_xor_si256(
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
dH[ 1] = _mm256_add_epi64(
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
dH[ 2] = _mm256_add_epi64(
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
dH[ 3] = _mm256_add_epi64(
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
dH[ 4] = _mm256_add_epi64(
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
dH[ 5] = _mm256_add_epi64(
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
dH[ 6] = _mm256_add_epi64(
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
dH[ 7] = _mm256_add_epi64(
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[4], 9 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[5], 10 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[6], 11 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
_mm256_xor_si256( qt[17], qt[10] ) ) );
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[7], 12 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
_mm256_xor_si256( qt[18], qt[11] ) ) );
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[0], 13 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
_mm256_xor_si256( qt[19], qt[12] ) ) );
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[1], 14 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
_mm256_xor_si256( qt[20], qt[13] ) ) );
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[2], 15 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
_mm256_xor_si256( qt[21], qt[14] ) ) );
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[3], 16 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
@@ -531,16 +531,17 @@ static const sph_u32 T512[64][16] = {

#define INPUT_BIG \
do { \
const __m256i zero = _mm256_setzero_si256(); \
__m256i db = *buf; \
const sph_u32 *tp = &T512[0][0]; \
m0 = m256_zero; \
m1 = m256_zero; \
m2 = m256_zero; \
m3 = m256_zero; \
m4 = m256_zero; \
m5 = m256_zero; \
m6 = m256_zero; \
m7 = m256_zero; \
m0 = zero; \
m1 = zero; \
m2 = zero; \
m3 = zero; \
m4 = zero; \
m5 = zero; \
m6 = zero; \
m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
@@ -913,9 +914,7 @@ void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )

void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{
__m256i *out = (__m256i*)dst;
__m256i pad[1];
size_t u;
int ch, cl;

sph_enc32be( &ch, sc->count_high );
@@ -925,8 +924,8 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
0UL, 0x80UL, 0UL, 0x80UL );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );
for ( u = 0; u < 8; u ++ )
out[u] = mm256_bswap_32( sc->h[u] );

mm256_block_bswap_32( (__m256i*)dst, sc->h );
}

#ifdef __cplusplus
@@ -83,7 +83,7 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
keys[14] = tmp1;
}

#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__

#define AESENC(i,j) \
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
}
}

#else // NO SSE4.2
#else // NO AVX

static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{

@@ -166,7 +166,7 @@ bool register_hodl_algo( algo_gate_t* gate )
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;

@@ -17,7 +17,7 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;

#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -64,7 +64,7 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -140,7 +140,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
return(0);


#else // no SSE4.2
#else // no AVX

uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -148,6 +148,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;

swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in pseudorandom data
@@ -205,7 +206,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
*hashes_done = CollisionCount;
return(0);

#endif // SSE4.2 else
#endif // AVX else

}
@@ -23,6 +23,7 @@ typedef struct
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
__m128i h[8];
__m128i w[80];
#else
@@ -32,7 +33,8 @@ typedef struct

#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE$_2__)
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
@@ -1,6 +1,6 @@
#ifndef __AVX2__

#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__

//Dependencies

@@ -6,7 +6,7 @@

void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);

#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__

#define AES_PARALLEL_N 8
@@ -77,6 +77,24 @@ static const sph_u32 V_INIT[5][8] = {
}
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW010[8] = {
SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
};

static const sph_u64 RCW014[8] = {
SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
};

#else

static const sph_u32 RC00[8] = {
SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
@@ -105,20 +123,18 @@ static const sph_u32 RC14[8] = {
SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW010[8] = {
SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
static const sph_u32 RC30[8] = {
SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};

static const sph_u64 RCW014[8] = {
SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
static const sph_u32 RC34[8] = {
SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};

#endif
@@ -137,19 +153,6 @@ static const sph_u32 RC24[8] = {
SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
};

static const sph_u32 RC30[8] = {
SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};

static const sph_u32 RC34[8] = {
SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};

#if SPH_LUFFA_PARALLEL
@@ -5,7 +5,7 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
//#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"

__thread uint64_t* lyra2h_4way_matrix;
@@ -50,6 +50,7 @@ void anime_4way_hash( void *state, const void *input )
__m256i vh_mask;
const uint32_t mask = 8;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

@@ -59,8 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -114,8 +114,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

if ( mm256_anybits1( vh_mask ) )
{
@@ -139,8 +138,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

if ( mm256_anybits1( vh_mask ) )
{

@@ -51,6 +51,7 @@ void quark_4way_hash( void *state, const void *input )
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const uint32_t mask = 8;
const __m256i zero = _mm256_setzero_si256();

memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );

@@ -60,8 +61,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -115,8 +115,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

if ( mm256_anybits1( vh_mask ) )
{
@@ -141,8 +140,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );

vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

if ( mm256_anybits1( vh_mask ) )
{
@@ -86,8 +86,7 @@ static const sph_u32 K256[64] = {
// SHA-256 4 way

#define SHA2s_MEXP( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );
mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] );
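Note: SHA2s_MEXP vectorises the standard SHA-256 message schedule, computing four independent schedules at once. Per 32-bit word the recurrence is W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]; in scalar form (written here for reference, not taken from the repository):

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, int c )
{
    return ( x >> c ) | ( x << ( 32 - c ) );
}

/* Scalar SHA-256 message expansion for word i (16 <= i < 64). */
static uint32_t sha256_mexp( const uint32_t W[], int i )
{
    uint32_t s0 = rotr32( W[i-15],  7 ) ^ rotr32( W[i-15], 18 ) ^ ( W[i-15] >>  3 );
    uint32_t s1 = rotr32( W[i- 2], 17 ) ^ rotr32( W[i- 2], 19 ) ^ ( W[i- 2] >> 10 );
    return s1 + W[i-7] + s0 + W[i-16];
}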
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
@@ -115,9 +114,8 @@ static const sph_u32 K256[64] = {
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
_mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
@@ -129,22 +127,8 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];

W[ 0] = mm128_bswap_32( in[ 0] );
W[ 1] = mm128_bswap_32( in[ 1] );
W[ 2] = mm128_bswap_32( in[ 2] );
W[ 3] = mm128_bswap_32( in[ 3] );
W[ 4] = mm128_bswap_32( in[ 4] );
W[ 5] = mm128_bswap_32( in[ 5] );
W[ 6] = mm128_bswap_32( in[ 6] );
W[ 7] = mm128_bswap_32( in[ 7] );
W[ 8] = mm128_bswap_32( in[ 8] );
W[ 9] = mm128_bswap_32( in[ 9] );
W[10] = mm128_bswap_32( in[10] );
W[11] = mm128_bswap_32( in[11] );
W[12] = mm128_bswap_32( in[12] );
W[13] = mm128_bswap_32( in[13] );
W[14] = mm128_bswap_32( in[14] );
W[15] = mm128_bswap_32( in[15] );
mm128_block_bswap_32( W, in );
mm128_block_bswap_32( W+8, in+8 );

A = r[0];
B = r[1];
@@ -266,7 +250,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )

void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
unsigned ptr, u;
unsigned ptr;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
@@ -294,8 +278,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );

for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm128_bswap_32( sc->val[u] );
mm128_block_bswap_32( dst, sc->val );
}

#if defined(__AVX2__)
@@ -326,15 +309,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
         mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )

#define SHA2x_MEXP( a, b, c, d ) \
   _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
        SSG2_1x( W[a] ), W[b] ), SSG2_0x( W[c] ) ), W[d] );
   mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );

#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   register __m256i T1, T2; \
   T1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
        _mm256_add_epi32( H, BSG2_1x(E) ), CHx(E, F, G) ), \
        _mm256_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
   T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
        _mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
   T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
   D = _mm256_add_epi32( D, T1 ); \
   H = _mm256_add_epi32( T1, T2 ); \
@@ -346,22 +327,8 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
   register __m256i A, B, C, D, E, F, G, H;
   __m256i W[16];

   W[ 0] = mm256_bswap_32( in[ 0] );
   W[ 1] = mm256_bswap_32( in[ 1] );
   W[ 2] = mm256_bswap_32( in[ 2] );
   W[ 3] = mm256_bswap_32( in[ 3] );
   W[ 4] = mm256_bswap_32( in[ 4] );
   W[ 5] = mm256_bswap_32( in[ 5] );
   W[ 6] = mm256_bswap_32( in[ 6] );
   W[ 7] = mm256_bswap_32( in[ 7] );
   W[ 8] = mm256_bswap_32( in[ 8] );
   W[ 9] = mm256_bswap_32( in[ 9] );
   W[10] = mm256_bswap_32( in[10] );
   W[11] = mm256_bswap_32( in[11] );
   W[12] = mm256_bswap_32( in[12] );
   W[13] = mm256_bswap_32( in[13] );
   W[14] = mm256_bswap_32( in[14] );
   W[15] = mm256_bswap_32( in[15] );
   mm256_block_bswap_32( W , in );
   mm256_block_bswap_32( W+8, in+8 );

   A = r[0];
   B = r[1];
@@ -484,7 +451,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )

void sha256_8way_close( sha256_8way_context *sc, void *dst )
{
   unsigned ptr, u;
   unsigned ptr;
   uint32_t low, high;
   const int buf_size = 64;
   const int pad = buf_size - 8;
@@ -513,8 +480,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )

   sha256_8way_round( sc->buf, sc->val );

   for ( u = 0; u < 8; u ++ )
      ((__m256i*)dst)[u] = mm256_bswap_32( sc->val[u] );
   mm256_block_bswap_32( dst, sc->val );
}

@@ -596,9 +562,8 @@ static const sph_u64 K512[80] = {
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   register __m256i T1, T2; \
   T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
        _mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
        _mm256_set1_epi64x( K512[i] ) ), W[i] ); \
   T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
        _mm256_set1_epi64x( K512[i] ), W[i] ) ); \
   T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
   D = _mm256_add_epi64( D, T1 ); \
   H = _mm256_add_epi64( T1, T2 ); \
@@ -611,11 +576,12 @@ sha512_4way_round( __m256i *in, __m256i r[8] )
   register __m256i A, B, C, D, E, F, G, H;
   __m256i W[80];

   for ( i = 0; i < 16; i++ )
      W[i] = mm256_bswap_64( in[i] );
   mm256_block_bswap_64( W , in );
   mm256_block_bswap_64( W+8, in+8 );

   for ( i = 16; i < 80; i++ )
      W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
             SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
      W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ],
             SSG5_0( W[ i-15 ] ), W[ i-16 ] );

   A = r[0];
   B = r[1];
@@ -689,7 +655,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )

void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
   unsigned ptr, u;
   unsigned ptr;
   const int buf_size = 128;
   const int pad = buf_size - 16;

@@ -711,8 +677,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
            mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
   sha512_4way_round( sc->buf, sc->val );

   for ( u = 0; u < 8; u ++ )
      ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
   mm256_block_bswap_64( dst, sc->val );
}

#endif // __AVX2__

@@ -20,6 +20,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
   const __m128i zero = _mm_setzero_si128();
   __m256i p0, p1, p2, p3, x;
   __m256i k00, k01, k02, k03, k10, k11, k12, k13;
   __m256i *m = (__m256i*)msg;
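From here on, every mm256_aesenc_2x128 call threads the hoisted zero round key through as an explicit argument. A rough sketch of the helper under that reading, assuming it runs one AES round per 128-bit lane with a shared __m128i key (the actual simd-utils definition may differ):

   #include <immintrin.h>   // AVX2 + AES-NI

   // One AES encryption round on each 128-bit lane of a 256-bit
   // vector, both lanes keyed with the same 128-bit round key.
   static inline __m256i mm256_aesenc_2x128( __m256i x, __m128i k )
   {
      __m128i lo = _mm_aesenc_si128( _mm256_castsi256_si128( x ), k );
      __m128i hi = _mm_aesenc_si128( _mm256_extracti128_si256( x, 1 ), k );
      return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
   }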
@@ -33,24 +34,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

   // round
   k00 = m[0];
   x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
   k01 = m[1];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
   k02 = m[2];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
   k03 = m[3];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

   p0 = _mm256_xor_si256( p0, x );

   k10 = m[4];
   x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
   k11 = m[5];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
   k12 = m[6];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
   k13 = m[7];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

   p2 = _mm256_xor_si256( p2, x );

@@ -59,129 +60,129 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
      // round 1, 5, 9

      k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
                                   mm256_aesenc_2x128( k00 ) ) );
                                   mm256_aesenc_2x128( k00, zero ) ) );

      if ( r == 0 )
         k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
                  ~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
                  ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
      k01 = _mm256_xor_si256( k00,
                mm256_ror1x32_128( mm256_aesenc_2x128( k01 ) ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );

      if ( r == 1 )
         k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
                  ~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
                  ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k01,
                mm256_ror1x32_128( mm256_aesenc_2x128( k02 ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k02,
                mm256_ror1x32_128( mm256_aesenc_2x128( k03 ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p3 = _mm256_xor_si256( p3, x );

      k10 = _mm256_xor_si256( k03,
                mm256_ror1x32_128( mm256_aesenc_2x128( k10 ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
      k11 = _mm256_xor_si256( k10,
                mm256_ror1x32_128( mm256_aesenc_2x128( k11 ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k11,
                mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k12,
                mm256_ror1x32_128( mm256_aesenc_2x128( k13 ) ) );
                mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );

      if ( r == 2 )
         k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
                  ~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
                  ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
      p1 = _mm256_xor_si256( p1, x );

      // round 2, 6, 10

      k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
      k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p2 = _mm256_xor_si256( p2, x );

      k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
      k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p0 = _mm256_xor_si256( p0, x );

      // round 3, 7, 11

      k00 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k00 ) ), k13 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ) );
                mm256_aesenc_2x128( k00, zero ) ), k13 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
      k01 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k01 ) ), k00 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
                mm256_aesenc_2x128( k01, zero ) ), k00 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k02 ) ), k01 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
                mm256_aesenc_2x128( k02, zero ) ), k01 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k03 ) ), k02 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
                mm256_aesenc_2x128( k03, zero ) ), k02 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p1 = _mm256_xor_si256( p1, x );

      k10 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k10 ) ), k03 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ) );
                mm256_aesenc_2x128( k10, zero ) ), k03 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
      k11 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k11 ) ), k10 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
                mm256_aesenc_2x128( k11, zero ) ), k10 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k12 ) ), k11 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
                mm256_aesenc_2x128( k12, zero ) ), k11 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( mm256_ror1x32_128(
                mm256_aesenc_2x128( k13 ) ), k12 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
                mm256_aesenc_2x128( k13, zero ) ), k12 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p3 = _mm256_xor_si256( p3, x );

      // round 4, 8, 12

      k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
      k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p0 = _mm256_xor_si256( p0, x );

      k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
      k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p2 = _mm256_xor_si256( p2, x );

@@ -190,36 +191,36 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   // round 13

   k00 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k00 ) ), k13 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
             mm256_aesenc_2x128( k00, zero ) ), k13 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
   k01 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k01 ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
             mm256_aesenc_2x128( k01, zero ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
   k02 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k02 ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
             mm256_aesenc_2x128( k02, zero ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
   k03 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k03 ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
             mm256_aesenc_2x128( k03, zero ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

   p3 = _mm256_xor_si256( p3, x );

   k10 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k10 ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
             mm256_aesenc_2x128( k10, zero ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
   k11 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k11 ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
             mm256_aesenc_2x128( k11, zero ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );

   k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) );
   k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
             ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
             ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );

   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
   k13 = _mm256_xor_si256( mm256_ror1x32_128(
             mm256_aesenc_2x128( k13 ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
             mm256_aesenc_2x128( k13, zero ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

   p1 = _mm256_xor_si256( p1, x );

@@ -87,6 +87,7 @@ static const sph_u32 IV512[] = {
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
   const __m128i zero = _mm_setzero_si128();
   __m128i p0, p1, p2, p3, x;
   __m128i k00, k01, k02, k03, k10, k11, k12, k13;
   __m128i *m = (__m128i*)msg;
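The one-way c512 gets the same treatment: the all-zero AES round key becomes a function-local const instead of the m128_zero macro/global at every call site, which encourages the compiler to keep it pinned in a register for the whole function. A minimal illustration of the pattern (aesenc_zero_key is a hypothetical name, not project code):

   #include <immintrin.h>   // AES-NI

   // One AES round with an all-zero round key; hoisting the constant
   // into a local lets it live in a register across repeated calls.
   static inline __m128i aesenc_zero_key( __m128i x )
   {
      const __m128i zero = _mm_setzero_si128();
      return _mm_aesenc_si128( x, zero );
   }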
@@ -101,38 +102,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
   // round
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k02 = m[2];
   x = _mm_xor_si128( x, k02 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k03 = m[3];
   x = _mm_xor_si128( x, k03 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );

   p0 = _mm_xor_si128( p0, x );

   k10 = m[4];
   x = _mm_xor_si128( p3, k10 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k11 = m[5];
   x = _mm_xor_si128( x, k11 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k12 = m[6];
   x = _mm_xor_si128( x, k12 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );
   k13 = m[7];
   x = _mm_xor_si128( x, k13 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );

   p2 = _mm_xor_si128( p2, x );

   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      if ( r == 0 )
@@ -140,8 +141,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count3, sc->count2, sc->count1, sc->count0 ) );

      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      if ( r == 1 )
@@ -149,32 +150,32 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count0, sc->count1, sc->count2, sc->count3 ) );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
      k02 = _mm_xor_si128( k02, k01 );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
      k03 = _mm_xor_si128( k03, k02 );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p3 = _mm_xor_si128( p3, x );

      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p2, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
      k11 = _mm_xor_si128( k11, k10 );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
      k12 = _mm_xor_si128( k12, k11 );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      if ( r == 2 )
@@ -182,78 +183,78 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count1, sc->count0, sc->count3, sc->count2 ) );

      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      p1 = _mm_xor_si128( p1, x );

      // round 2, 6, 10

      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
      x = _mm_xor_si128( p3, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p2 = _mm_xor_si128( p2, x );

      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
      x = _mm_xor_si128( p1, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p0 = _mm_xor_si128( p0, x );

      // round 3, 7, 11

      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
      k00 = _mm_xor_si128( k00, k13 );
      x = _mm_xor_si128( p2, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
      k01 = _mm_xor_si128( k01, k00 );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
      k02 = _mm_xor_si128( k02, k01 );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
      k03 = _mm_xor_si128( k03, k02 );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p1 = _mm_xor_si128( p1, x );

      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
      k10 = _mm_xor_si128( k10, k03 );
      x = _mm_xor_si128( p0, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
      k11 = _mm_xor_si128( k11, k10 );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
      k12 = _mm_xor_si128( k12, k11 );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      x = _mm_aesenc_si128( x, zero );
      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
      k13 = _mm_xor_si128( k13, k12 );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p3 = _mm_xor_si128( p3, x );

@@ -261,73 +262,73 @@ c512( sph_shavite_big_context *sc, const void *msg )

      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
      x = _mm_xor_si128( p1, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p0 = _mm_xor_si128( p0, x );

      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
      x = _mm_xor_si128( p3, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );
      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
      x = _mm_aesenc_si128( x, zero );

      p2 = _mm_xor_si128( p2, x );
   }

   // round 13

   k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
   k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
   k00 = _mm_xor_si128( k00, k13 );
   x = _mm_xor_si128( p0, k00 );
   x = _mm_aesenc_si128( x, m128_zero );
   k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
   k01 = _mm_xor_si128( k01, k00 );
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, m128_zero );
   k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
   k02 = _mm_xor_si128( k02, k01 );
   x = _mm_xor_si128( x, k02 );
   x = _mm_aesenc_si128( x, m128_zero );
   k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
   k03 = _mm_xor_si128( k03, k02 );
   x = _mm_xor_si128( x, k03 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );

   p3 = _mm_xor_si128( p3, x );

   k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
   k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
   k10 = _mm_xor_si128( k10, k03 );
   x = _mm_xor_si128( p2, k10 );
   x = _mm_aesenc_si128( x, m128_zero );
   k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
   k11 = _mm_xor_si128( k11, k10 );
   x = _mm_xor_si128( x, k11 );
   x = _mm_aesenc_si128( x, m128_zero );
   k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
              ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
   x = _mm_xor_si128( x, k12 );
   x = _mm_aesenc_si128( x, m128_zero );
   k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
   x = _mm_aesenc_si128( x, zero );
   k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
   k13 = _mm_xor_si128( k13, k12 );
   x = _mm_xor_si128( x, k13 );
   x = _mm_aesenc_si128( x, m128_zero );
   x = _mm_aesenc_si128( x, zero );

   p1 = _mm_xor_si128( p1, x );

@@ -342,6 +342,7 @@ void fft128_2way( void *a )

void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
   const __m256i zero = _mm256_setzero_si256();
   static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
   static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

@@ -352,10 +353,10 @@ void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
#define UNPACK( i ) \
do { \
   __m256i t = X[i]; \
   A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[2*i] = _mm256_unpacklo_epi8( t, zero ); \
   A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].v256 ); \
   A[2*i+8] = REDUCE(A[2*i+8]); \
   A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
   A[2*i+1] = _mm256_unpackhi_epi8( t, zero ); \
   A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].v256 ); \
   A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
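Both UNPACK variants funnel their products through REDUCE before further use. For context, a hedged sketch of the usual partial reduction modulo 257 on 16-bit lanes, relying on 256 ≡ -1 (mod 257) so that x = 256*hi + lo ≡ lo - hi (reduce_mod257 is a stand-in name; the project's REDUCE macro may differ in detail):

   #include <immintrin.h>

   // Partial reduction of each signed 16-bit lane modulo 257:
   // x = 256*hi + lo, and 256 = -1 (mod 257), so x = lo - hi.
   static inline __m256i reduce_mod257( __m256i x )
   {
      return _mm256_sub_epi16(
                _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ),
                _mm256_srai_epi16( x, 8 ) );
   }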
@@ -365,10 +366,10 @@ do { \
do { \
   __m256i t = X[i]; \
   __m256i tmp; \
   A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[2*i] = _mm256_unpacklo_epi8( t, zero ); \
   A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].v256 ); \
   A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
   tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
   tmp = _mm256_unpackhi_epi8( t, zero ); \
   A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
   A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                  FFT128_Twiddle[ 2*i+1 ].v256 );\
@@ -392,6 +393,7 @@ do { \

void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
   const __m256i zero = _mm256_setzero_si256();
   static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
   static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

@@ -402,11 +404,11 @@ void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
#define UNPACK( i ) \
do { \
   __m256i t = X[i]; \
   A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[ 2*i ] = _mm256_unpacklo_epi8( t, zero ); \
   A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                       FFT256_Twiddle[ 2*i ].v256 ); \
   A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
   A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
   A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, zero ); \
   A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
                                       FFT256_Twiddle[ 2*i + 1 ].v256 ); \
   A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
@@ -417,11 +419,11 @@ do { \
do { \
   __m256i t = X[i]; \
   __m256i tmp; \
   A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[ 2*i ] = _mm256_unpacklo_epi8( t, zero ); \
   A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                       FFT256_Twiddle[ 2*i ].v256 ); \
   A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
   tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
   tmp = _mm256_unpackhi_epi8( t, zero ); \
   A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \
   A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                       FFT256_Twiddle[ 2*i + 1 ].v256 ); \
@@ -446,6 +448,8 @@ do { \
   fft128_2way( a+256 );
}

#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}

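c1_16 expands to a brace initializer, which is what lets code[] below stay static const: a static initializer must be a compile-time constant expression, and an intrinsic-based helper such as mm256_const1_16 cannot appear in one. A hedged sketch of the idea, with an assumed shape for the m256_v16 union:

   #include <immintrin.h>
   #include <stdint.h>

   // Assumed layout: sixteen 16-bit lanes overlaid on a __m256i, so a
   // brace initializer builds the constant at compile time.
   typedef union { uint16_t u16[16]; __m256i v256; } m256_v16_sketch;

   #define C1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}

   static const m256_v16_sketch codes[] = { C1_16(185), C1_16(233) };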
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
   register __m256i S0l, S1l, S2l, S3l;
@@ -453,7 +457,8 @@ void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
   __m256i *S = (__m256i*) state;
   __m256i *M = (__m256i*) msg;
   __m256i *W = (__m256i*) fft;
   static const m256_v16 code[] = { mm256_const1_16(185), mm256_const1_16(233) };
   static const m256_v16 code[] = { c1_16(185), c1_16(233) };

   S0l = _mm256_xor_si256( S[0], M[0] );
   S0h = _mm256_xor_si256( S[1], M[1] );
