This commit is contained in:
Jay D Dee
2019-05-19 13:39:45 -04:00
parent bfd1c002f9
commit e1aead3c76
139 changed files with 10907 additions and 4218 deletions

View File

@@ -48,6 +48,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
// serial data is only 32 bytes so AVX2 is the limit for that dimension.
// However, 2 way parallel looks trivial to code for AVX512 except for
// a data dependency with rowa.
#if defined __AVX2__
// only available with avx2
@@ -65,13 +69,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_ror256_1x64( s1); \
s1 = mm256_ror_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rol256_1x64( s3 ); \
s3 = mm256_rol_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rol256_1x64( s1 ); \
s1 = mm256_rol_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_ror256_1x64( s3 );
s3 = mm256_ror_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -93,25 +97,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
c = _mm_add_epi64( c, d ); \
b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
a = _mm_add_epi64( a, b ); \
d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi64( c, d ); \
b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_ror256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rol256_1x64( s6, s7 ); \
mm128_ror256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rol256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_ror256_1x64( s6, s7 );
mm128_rol256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \