Jay D Dee
2021-04-08 18:09:31 -04:00
parent 902ec046dd
commit f3333b0070
17 changed files with 826 additions and 336 deletions

View File

@@ -1225,37 +1225,6 @@ static inline void intrlv_4x64( void *dst, const void *src0,
d[31] = _mm_unpackhi_epi64( s2[7], s3[7] );
}
/*
static inline void intrlv_4x64( void *dst, void *src0,
void *src1, void *src2, void *src3, int bit_len )
{
uint64_t *d = (uint64_t*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
uint64_t *s2 = (uint64_t*)src2;
uint64_t *s3 = (uint64_t*)src3;
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0];
d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1];
d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2];
d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3];
if ( bit_len <= 256 ) return;
d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4];
d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5];
d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6];
d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7];
if ( bit_len <= 512 ) return;
d[ 32] = s0[ 8]; d[ 33] = s1[ 8]; d[ 34] = s2[ 8]; d[ 35] = s3[ 8];
d[ 36] = s0[ 9]; d[ 37] = s1[ 9]; d[ 38] = s2[ 9]; d[ 39] = s3[ 9];
if ( bit_len <= 640 ) return;
d[ 40] = s0[10]; d[ 41] = s1[10]; d[ 42] = s2[10]; d[ 43] = s3[10];
d[ 44] = s0[11]; d[ 45] = s1[11]; d[ 46] = s2[11]; d[ 47] = s3[11];
d[ 48] = s0[12]; d[ 49] = s1[12]; d[ 50] = s2[12]; d[ 51] = s3[12];
d[ 52] = s0[13]; d[ 53] = s1[13]; d[ 54] = s2[13]; d[ 55] = s3[13];
d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14];
d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15];
}
*/
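The new intrlv_4x64 above uses SSE2 unpack instructions in place of the scalar
copy loop that is being removed. As a rough illustration of the idiom (the
function and variable names below are illustrative only, not taken from the
commit), a pair of 128-bit loads from two streams can be zipped into
interleaved order with one unpacklo/unpackhi pair:

#include <emmintrin.h>
#include <stdint.h>

// Illustrative sketch: interleave two 2-qword streams into the order
// a0 b0 a1 b1, the same unpack idiom used by the 4x64 functions here.
static inline void interleave2_qwords( uint64_t d[4], const uint64_t a[2],
                                       const uint64_t b[2] )
{
   __m128i va = _mm_loadu_si128( (const __m128i*)a );
   __m128i vb = _mm_loadu_si128( (const __m128i*)b );
   _mm_storeu_si128( (__m128i*) d,        _mm_unpacklo_epi64( va, vb ) ); // a0 b0
   _mm_storeu_si128( (__m128i*)( d + 2 ), _mm_unpackhi_epi64( va, vb ) ); // a1 b1
}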
static inline void intrlv_4x64_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
@@ -1282,26 +1251,6 @@ static inline void intrlv_4x64_512( void *dst, const void *src0,
d[15] = _mm_unpackhi_epi64( s2[3], s3[3] );
}
/*
static inline void intrlv_4x64_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
uint64_t *d = (uint64_t*)dst;
const uint64_t *s0 = (const uint64_t*)src0;
const uint64_t *s1 = (const uint64_t*)src1;
const uint64_t *s2 = (const uint64_t*)src2;
const uint64_t *s3 = (const uint64_t*)src3;
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0];
d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1];
d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2];
d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3];
d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4];
d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5];
d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6];
d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7];
}
*/
static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -1347,38 +1296,6 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
d3[7] = _mm_unpackhi_epi64( s[29], s[31] );
}
/*
static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
uint64_t *d2 = (uint64_t*)dst2;
uint64_t *d3 = (uint64_t*)dst3;
const uint64_t *s = (const uint64_t*)src;
d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3];
d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7];
d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11];
d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15];
if ( bit_len <= 256 ) return;
d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19];
d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23];
d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27];
d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31];
if ( bit_len <= 512 ) return;
d0[ 8] = s[32]; d1[ 8] = s[33]; d2[ 8] = s[34]; d3[ 8] = s[35];
d0[ 9] = s[36]; d1[ 9] = s[37]; d2[ 9] = s[38]; d3[ 9] = s[39];
if ( bit_len <= 640 ) return;
d0[10] = s[40]; d1[10] = s[41]; d2[10] = s[42]; d3[10] = s[43];
d0[11] = s[44]; d1[11] = s[45]; d2[11] = s[46]; d3[11] = s[47];
d0[12] = s[48]; d1[12] = s[49]; d2[12] = s[50]; d3[12] = s[51];
d0[13] = s[52]; d1[13] = s[53]; d2[13] = s[54]; d3[13] = s[55];
d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59];
d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63];
}
*/
static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
@@ -1405,26 +1322,6 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
d3[3] = _mm_unpackhi_epi64( s[13], s[15] );
}
/*
static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
uint64_t *d2 = (uint64_t*)dst2;
uint64_t *d3 = (uint64_t*)dst3;
const uint64_t *s = (const uint64_t*)src;
d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3];
d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7];
d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11];
d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15];
d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19];
d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23];
d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27];
d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31];
}
*/
static inline void extr_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
@@ -1440,9 +1337,41 @@ static inline void extr_lane_4x64( void *d, const void *s,
}
#if defined(__AVX2__)
// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code.
// There are alignment problems with the source buffer on Windows,
// so the 256 bit bswap can't be used.
static inline void mm256_intrlv80_4x64( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
__m128i s1 = casti_m128i( src,1 );
__m128i s2 = casti_m128i( src,2 );
__m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 );
casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_m128i( d, 2 ) =
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee );
casti_m128i( d, 4 ) =
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_m128i( d, 6 ) =
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee );
casti_m128i( d, 8 ) =
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_m128i( d, 10 ) =
casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee );
casti_m128i( d, 12 ) =
casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_m128i( d, 14 ) =
casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee );
casti_m128i( d, 16 ) =
casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_m128i( d, 18 ) =
casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee );
}
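mm256_intrlv80_4x64 above broadcasts each 64-bit word of an 80-byte block
header into all four interleaved lanes: the _mm_shuffle_epi32 controls 0x44
and 0xee duplicate the low and high qword of each 128-bit load, and each
result is stored twice. A scalar equivalent, given here only as a sketch for
clarity (not part of the commit), would be:

// Sketch of the scalar equivalent: replicate each of the ten source qwords
// four times so every lane of the 4x64 buffer starts from the same header.
static inline void intrlv80_4x64_ref( uint64_t *d, const uint64_t *s )
{
   for ( int i = 0; i < 10; i++ )
      d[ 4*i ] = d[ 4*i + 1 ] = d[ 4*i + 2 ] = d[ 4*i + 3 ] = s[ i ];
}

Presumably each lane then gets its own nonce written into the interleaved
buffer before hashing, which is the usual pattern for these 80-byte helpers.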
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
@@ -1636,40 +1565,6 @@ static inline void intrlv_8x64_512( void *dst, const void *src0,
d[31] = _mm_unpackhi_epi64( s6[3], s7[3] );
}
/*
#define ILEAVE_8x64( i ) do \
{ \
uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \
d[0] = *( (const uint64_t*)(s0) +(i) ); \
d[1] = *( (const uint64_t*)(s1) +(i) ); \
d[2] = *( (const uint64_t*)(s2) +(i) ); \
d[3] = *( (const uint64_t*)(s3) +(i) ); \
d[4] = *( (const uint64_t*)(s4) +(i) ); \
d[5] = *( (const uint64_t*)(s5) +(i) ); \
d[6] = *( (const uint64_t*)(s6) +(i) ); \
d[7] = *( (const uint64_t*)(s7) +(i) ); \
} while(0)
static inline void intrlv_8x64( void *dst, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
ILEAVE_8x64( 0 ); ILEAVE_8x64( 1 );
ILEAVE_8x64( 2 ); ILEAVE_8x64( 3 );
if ( bit_len <= 256 ) return;
ILEAVE_8x64( 4 ); ILEAVE_8x64( 5 );
ILEAVE_8x64( 6 ); ILEAVE_8x64( 7 );
if ( bit_len <= 512 ) return;
ILEAVE_8x64( 8 ); ILEAVE_8x64( 9 );
if ( bit_len <= 640 ) return;
ILEAVE_8x64( 10 ); ILEAVE_8x64( 11 );
ILEAVE_8x64( 12 ); ILEAVE_8x64( 13 );
ILEAVE_8x64( 14 ); ILEAVE_8x64( 15 );
}
#undef ILEAVE_8x64
*/
static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
@@ -1815,39 +1710,6 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2,
d7[3] = _mm_unpackhi_epi64( s[27], s[31] );
}
/*
#define DLEAVE_8x64( i ) do \
{ \
const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \
*( (uint64_t*)(d0) +(i) ) = s[0]; \
*( (uint64_t*)(d1) +(i) ) = s[1]; \
*( (uint64_t*)(d2) +(i) ) = s[2]; \
*( (uint64_t*)(d3) +(i) ) = s[3]; \
*( (uint64_t*)(d4) +(i) ) = s[4]; \
*( (uint64_t*)(d5) +(i) ) = s[5]; \
*( (uint64_t*)(d6) +(i) ) = s[6]; \
*( (uint64_t*)(d7) +(i) ) = s[7]; \
} while(0)
static inline void dintrlv_8x64( void *d0, void *d1, void *d2, void *d3,
void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len )
{
DLEAVE_8x64( 0 ); DLEAVE_8x64( 1 );
DLEAVE_8x64( 2 ); DLEAVE_8x64( 3 );
if ( bit_len <= 256 ) return;
DLEAVE_8x64( 4 ); DLEAVE_8x64( 5 );
DLEAVE_8x64( 6 ); DLEAVE_8x64( 7 );
if ( bit_len <= 512 ) return;
DLEAVE_8x64( 8 ); DLEAVE_8x64( 9 );
if ( bit_len <= 640 ) return;
DLEAVE_8x64( 10 ); DLEAVE_8x64( 11 );
DLEAVE_8x64( 12 ); DLEAVE_8x64( 13 );
DLEAVE_8x64( 14 ); DLEAVE_8x64( 15 );
}
#undef DLEAVE_8x64
*/
static inline void extr_lane_8x64( void *d, const void *s,
const int lane, const int bit_len )
{

View File

@@ -178,7 +178,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 )
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
@@ -263,7 +263,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -291,16 +292,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 )
//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 )
//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#if defined(__SSSE3__)
// Rotate right by c bytes
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_ror_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }

View File

@@ -18,7 +18,7 @@
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )
// Mo0ve low element of vector to integer.
// Move low element of vector to integer.
#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
@@ -42,7 +42,7 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
@@ -168,7 +168,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
// AVX512, control must be 8 bit immediate.
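With the feature test reduced to __AVX512VL__, the 64-bit rotates compile to a
single instruction when the extension is present and otherwise fall back to
the shift-and-OR emulation seen in the surrounding context. A hedged sketch of
that shape (the macro name follows the file's mm256_ prefix convention but is
reconstructed from context, not copied from the commit):

#if defined(__AVX512VL__)
   // Single-instruction rotate available with AVX512VL.
   #define mm256_ror_64( v, c )   _mm256_ror_epi64( v, c )
#else
   // AVX2 fallback: emulate the rotate with two shifts and an OR.
   #define mm256_ror_64( v, c ) \
      _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                       _mm256_slli_epi64( v, 64-(c) ) )
#endif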
@@ -198,21 +201,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements across all lanes.
//
// AVX2 has no full vector permute for elements less than 32 bits.
// AVX512 has finer granularity full vector permutes.
// AVX512 has full vector alignr which might be faster, especially for 32 bit elements.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
#if defined(__AVX512F__) && defined(__AVX512VL__)
static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
@@ -220,21 +216,8 @@ static inline __m256i mm256_ror_1x32( const __m256i v )
static inline __m256i mm256_rol_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 7 ); }
static inline __m256i mm256_ror_3x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 3 ); }
static inline __m256i mm256_rol_3x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 5 ); }
#else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -246,17 +229,6 @@ static inline __m256i mm256_rol_3x32( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 )
// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_ror_3x32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000200000001, 0x0000000000000007, \
0x0000000600000005, 0x0000000400000003 )
#define mm256_rol_3x32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000400000003, 0x0000000200000001, \
0x0000000000000007, 0x0000000600000005 )
#endif // AVX512 else AVX2
//