This commit is contained in:
Jay D Dee
2021-09-29 17:31:16 -04:00
parent 9b905fccc8
commit 2cd1507c2e
80 changed files with 8145 additions and 2097 deletions

View File

@@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1,
d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51];
d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55];
d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59];
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63];
d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63];
}
static inline void extr_lane_2x32( void *dst, const void *src,

View File

@@ -35,6 +35,13 @@
///////////////////////////////////////////////////////////////////////////
// Used instead if casting.
typedef union
{
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
// Efficient and convenient moving between GP & low bits of XMM.
// Use VEX when available to give access to xmm8-15 and zero extend for
// larger vectors.
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
return a;
}
static inline uint64_t mm128_mov128_64( const __m128i a )
// Inconstant naming, prefix should reflect return value:
// u64_mov128_64
static inline uint64_t u64_mov128_64( const __m128i a )
{
uint64_t n;
#if defined(__AVX__)
@@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a )
return n;
}
static inline uint32_t mm128_mov128_32( const __m128i a )
static inline uint32_t u32_mov128_32( const __m128i a )
{
uint32_t n;
#if defined(__AVX__)
@@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
// Extract 32 bit element c from v and return as integer.
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#endif // SSE4_1
//
@@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] ||
// Blend 4 32 bit elements from 4 vectors
#if defined (__AVX2__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
#elif defined(__SSE4_1)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
_mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
#endif
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
@@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_rorx2_64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define mm128_rolx2_64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define mm128_rorx2_32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define mm128_rolx2_32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
@@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
v0 = _mm_slli_epi64( v0, 64-(c) ); \
v1 = _mm_slli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
v0 = _mm_srli_epi64( v0, 64-(c) ); \
v1 = _mm_srli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rorx2_32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
v0 = _mm_slli_epi32( v0, 32-(c) ); \
v1 = _mm_slli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
v0 = _mm_srli_epi32( v0, 32-(c) ); \
v1 = _mm_srli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#endif // AVX512 else SSE2
#define mm128_ror_16( v, c ) \
@@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements accross all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_ror_x8( const __m128i v, const int c )
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
//
@@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
#define mm128_ror256_64( v1, v2 ) \
#if defined(__SSSE3__)
// Function macro with two inputs and one output, inputs are preserved.
// Returns modified first arg.
// Two input functions are not available without SSSE3. Use procedure
// belowe instead.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed.
// Returns both modified args in place.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 1nputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_rol256_64( v1, v2 ) \
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#define mm128_ror256_32( v1, v2 ) \
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_rol256_32( v1, v2 ) \
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_ror256_16( v1, v2 ) \
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_rol256_16( v1, v2 ) \
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_ror256_8( v1, v2 ) \
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_rol256_8( v1, v2 ) \
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -483,7 +614,7 @@ do { \
#else // SSE2
#define mm128_ror256_64( v1, v2 ) \
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
@@ -492,7 +623,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_64( v1, v2 ) \
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
@@ -501,7 +632,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_32( v1, v2 ) \
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
@@ -510,7 +641,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_32( v1, v2 ) \
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
@@ -519,7 +650,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_16( v1, v2 ) \
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
@@ -528,7 +659,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_16( v1, v2 ) \
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
@@ -537,7 +668,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror256_8( v1, v2 ) \
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
@@ -546,7 +677,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol256_8( v1, v2 ) \
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \

View File

@@ -14,13 +14,28 @@
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Used instead if casting.
typedef union
{
__m256i m256;
__m128i m128[2];
uint64_t u64[4];
uint32_t u32[8];
} __attribute__ ((aligned (32))) m256_ovly;
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )
// Move low element of vector to integer.
#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// deprecated
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
@@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
_mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
// Blend 8 32 bit elements from 8 vectors
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( \
_mm256_blend_epi32( v7, v6, 0x40 ), \
_mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x04) \
_mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x44) \
_mm256_blend_epi32( v1, v0, 0x11 ) )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128.
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
@@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#define mm256_rorx2_64( v1, v0, c ) \
_mm256_ror_epi64( v0, c ); \
_mm256_ror_epi64( v1, c )
#define mm256_rolx2_64( v1, v0, c ) \
_mm256_rol_epi64( v0, c ); \
_mm256_rol_epi64( v1, c )
#define mm256_rorx2_32( v1, v0, c ) \
_mm256_ror_epi32( v0, c ); \
_mm256_ror_epi32( v1, c )
#define mm256_rolx2_32( v1, v0, c ) \
_mm256_rol_epi32( v0, c ); \
_mm256_rol_epi32( v1, c )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
@@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
#define mm256_rorx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi64( v0, c ); \
__m256i t1 = _mm256_srli_epi64( v1, c ); \
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi64( v0, c ); \
__m256i t1 = _mm256_slli_epi64( v1, c ); \
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rorx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi32( v0, c ); \
__m256i t1 = _mm256_srli_epi32( v1, c ); \
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( v0, c ); \
__m256i t1 = _mm256_slli_epi32( v1, c ); \
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#endif // AVX512 else AVX2
#define mm256_ror_16( v, c ) \
@@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements accross all lanes.
#if defined(__AVX512VL__)
static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_rol_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 7 ); }
#else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 )
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_rol_1x32( v ) \
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 )
0x0000000200000001, 0x0000000000000007 ) )
#endif // AVX512 else AVX2
//
// Rotate elements within each 128 bit lane of 256 bit vector.
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
@@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
#define mm256_ror512_128( v1, v2 ) \
#define mm256_vror512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_rol512_128( v1, v2 ) \
#define mm256_vrol512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v2 = _mm256_permute2x128( v2, v1, 0x21 ); \

View File

@@ -74,13 +74,22 @@
// __AVX512VBMI__ __AVX512VAES__
//
// Used instead if casting.
typedef union
{
__m512i m512;
__m128i m128[4];
uint32_t u32[16];
uint64_t u64[8];
} __attribute__ ((aligned (64))) m512_ovly;
// Move integer to/from element 0 of vector.
#define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )
#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) )
#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) )
#define u64_mov512_64( a ) u64_mov128_64( _mm256_castsi512_si128( a ) )
#define u32_mov512_32( a ) u32_mov128_32( _mm256_castsi512_si128( a ) )
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
// Equivalent of set, assign 64 bit integers to respective 64 bit elements.
// Use stack memory overlay
static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
@@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses 8 bit truth table to define any 3 input logical
// operation using any number or combinations of AND, OR XOR, NOT.
// expression using any number or combinations of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
@@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b & c )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ]
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
@@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Some 2 input operations that don't have their own instruction mnemonic.
// ~( a | b )
// ~( a | b ), (~a) & (~b)
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), same as (~a) ^ b
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
@@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
_mm512_ternarylogic_epi64( a, b, b, 0xef )
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm512_mask_blend_epi64( 0x0f, \
_mm512_mask_blend_epi64( 0x30, \
_mm512_mask_blend_epi64( 0x40, v7, v6 ), \
_mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
_mm512_mask_blend_epi64( 0x03, \
_mm512_mask_blend_epi64( 0x04, v3, v2 ) \
_mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
@@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Rotate elements in 512 bit vector.
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// rename plan change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
//variable rotate rorv, rolv,
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
// operation. 1xNN notaion ia also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
#define mm512_shiftl_256( v ) mm512_shifr_256
#define mm512_shiftr_128( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 2 )
#define mm512_shiftl_128( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 6 )
#define mm512_shiftr_64( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 1 )
#define mm512_shiftl_64( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 7 )
#define mm512_shiftr_32( v ) \
_mm512_alignr_epi32( _mm512_setzero, v, 1 )
#define mm512_shiftl_32( v ) \
_mm512_alignr_epi32( v, _mm512_setzero, 15 )
// Shuffle-rotate elements left or right in 512 bit vector.
static inline __m512i mm512_swap_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256( v ) mm512_swap_256
#define mm512_shufll_256( v ) mm512_swap_256
static inline __m512i mm512_ror_1x128( const __m512i v )
static inline __m512i mm512_shuflr_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 2 ); }
static inline __m512i mm512_rol_1x128( const __m512i v )
static inline __m512i mm512_shufll_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 6 ); }
static inline __m512i mm512_ror_1x64( const __m512i v )
static inline __m512i mm512_shuflr_64( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 1 ); }
static inline __m512i mm512_rol_1x64( const __m512i v )
static inline __m512i mm512_shufll_64( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 7 ); }
static inline __m512i mm512_ror_1x32( const __m512i v )
static inline __m512i mm512_shuflr_32( const __m512i v )
{ return _mm512_alignr_epi32( v, v, 1 ); }
static inline __m512i mm512_rol_1x32( const __m512i v )
static inline __m512i mm512_shufll_32( const __m512i v )
{ return _mm512_alignr_epi32( v, v, 15 ); }
static inline __m512i mm512_ror_x64( const __m512i v, const int n )
// Generic
static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
{ return _mm512_alignr_epi64( v, v, n ); }
static inline __m512i mm512_ror_x32( const __m512i v, const int n )
static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
#define mm512_ror_1x16( v ) \
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
#define mm512_rol_1x16( v ) \
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
#define mm512_ror_1x8( v ) \
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x003F3E3D3C3B3A39, 0x3837363534333231, \
0x302F2E2D2C2B2A29, 0x2827262524232221, \
0x201F1E1D1C1B1A19. 0x1817161514131211, \
0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
#define mm512_rol_1x8( v ) \
#define mm512_shufll_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3B3A393837, 0x363534333231302F. \
0x2E2D2C2B2A292827, 0x262524232221201F, \
@@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror256_32( v ) \
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol256_32( v ) \
#define mm512_shufll256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror256_16( v ) \
#define mm512_shuflr256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x00100001001e001d, 0x001c001b001a0019, \
0x0018001700160015, 0x0014001300120011, \
0x0000000f000e000d, 0x000c000b000a0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol256_16( v ) \
#define mm512_shufll256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a001900180017, \
0x0016001500140013, 0x001200110010001f, \
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
#define mm512_ror256_8( v ) \
#define mm512_shuflr256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x203f3e3d3c3b3a39, 0x3837363534333231, \
0x302f2e2d2c2b2a29, 0x2827262524232221, \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm512_rol256_8( v ) \
#define mm512_shufll256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231302f, \
0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
0x0e0d0c0b0a090807, 0x060504030201001f ) )
//
// Rotate elements within 128 bit lanes of 512 bit vector.
// Shuffle-roate elements within 128 bit lanes of 512 bit vector.
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes
static inline __m512i mm512_ror128_x8( const __m512i v, const int c )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Swap 32 bits in each 64 bit lane.
// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
// but only with AVX512. Shuffle is just as fast and availble with AVX2
// & SSE2.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// Rotate elements from 2 512 bit vectors in place, source arguments
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easilly be rebuilt using shufl2 functions
// add shuflr shufll functions performing rotate, returning first arg
// They're faster than doing both, when both not needed.
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
// rotated v1
// visually confusing for shif2r because of arg order. First arg is always
// the target for modification, either update by reference or by function
// return.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.
#define mm512_swap1024_512( v1, v2 ) \
v1 = _mm512_xor_si512( v1, v2 ); \
v2 = _mm512_xor_si512( v1, v2 ); \
v1 = _mm512_xor_si512( v1, v2 );
#define mm512_shufl2l_512 mm512_swap1024_512 \
#define mm512_shufl2r_512 mm512_swap1024_512 \
#define mm512_ror1024_256( v1, v2 ) \
// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
// for now.
// Rotate elements from 2 512 bit vectors in place, both source arguments
// are updated.
#define mm512_vror1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_256( v1, v2 ) \
#define mm512_vrol1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_128( v1, v2 ) \
#define mm512_vror1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_128( v1, v2 ) \
#define mm512_vrol1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_64( v1, v2 ) \
#define mm512_vror1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_64( v1, v2 ) \
#define mm512_vrol1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_ror1024_32( v1, v2 ) \
#define mm512_vror1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1024_32( v1, v2 ) \
#define mm512_vrol1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \

View File

@@ -68,13 +68,13 @@
// rotation.
// Swap hi & lo 32 bits.
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 )
#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 )
#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 )
#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#if defined(__SSSE3__)
@@ -86,7 +86,7 @@
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline __m64 mm64_ror_x8( __m64 v, const int c )
static inline __m64 mm64_vror_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else

View File

@@ -5,10 +5,19 @@
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
// safe division, integer or floating point
// Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero.
// Returns safe_result if division by zero.
#define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
///////////////////////////////////////
//
@@ -29,12 +38,14 @@
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// obsolete test
// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
#endif
// obsolte test
#if !defined(GCC_INT128)
#warning "__int128 not supported, requires GCC-4.8 or newer."
#endif