This commit is contained in:
Jay D Dee
2022-08-01 20:21:05 -04:00
parent 1321ac474c
commit 58030e2788
27 changed files with 312 additions and 4734 deletions

View File

@@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
@@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_var_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_var_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
@@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
@@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( b ), c ) );
//
// Rotate vector elements across all lanes
@@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
//
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectorse.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
@@ -552,8 +580,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
// These functions are preferred but only available with SSSE3. Use procedure
// macros below for SSE2 compatibility.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
@@ -568,8 +596,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
// with existing code. The function macros above can be used more efficiently.
// Deprecated for SSSE3 and above, SSSE3 versions exist only for
// compatibility with existing code.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -13,6 +13,18 @@
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
//
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
// version is not. Some usage has the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If needed and AVX512 is available, 256 bit full vector shuffles can be
// implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple:
// _mm256_maskz_shuffle_epi8( k0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag:
// vpshufb ymm0{k0}{z}, ymm1, ymm2
#if defined(__AVX__)
@@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.
#define mm256_ror_var_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_var_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_var_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
// AVX512, control must be 8 bit immediate.
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
#define mm256_rorx2_64( v1, v0, c ) \
{ \
@@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Rotate elements across all lanes.
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
@@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
_mm256_castsi256_ps( b ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
@@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302, \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201, \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
@@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these could use permute but that appears to be slower. Maybe a
// Ryzen issue.
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \

View File

@@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
// elements and can be called directly.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// For convenience and consistency with AVX2
// For convenience and consistency with AVX2 macros.
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi64( v, c ),
_mm512_slli_epi64( v, 64-c ) );
}
static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi64( v, c ),
_mm512_srli_epi64( v, 64-c ) );
}
static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi32( v, c ),
_mm512_slli_epi32( v, 32-c ) );
}
static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi32( v, c ),
_mm512_srli_epi32( v, 32-c ) );
}
static inline __m512i mm512_ror_16( __m512i const v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi16( v, c ),
_mm512_slli_epi16( v, 16-c ) );
}
static inline __m512i mm512_rol_16( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi16( v, c ),
_mm512_srli_epi16( v, 16-c ) );
}
// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
@@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Swap 32 bits in each 64 bit lane. Can be done with a rotate instruction
// but only with AVX512. Shuffle is just as fast and available with AVX2
// & SSE2.
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
// 2 input, 1 output
// Rotate concatenated { v1, v2 ) right or left and return v1.
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )