Jay D Dee
2024-05-20 23:08:50 -04:00
parent 4f930574cc
commit 042d13d1e1
129 changed files with 835 additions and 538 deletions


@@ -207,12 +207,12 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
-// broadcast (replicate) lane l to all lanes
-#define v128_replane64( v, l ) \
+// Broadcast lane l to all lanes
+#define v128_duplane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
: _mm_shuffle_epi32( v, 0xee )
-#define v128_replane32( v, l ) \
+#define v128_duplane32( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
: ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
: ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
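
For context, each 2-bit field of the _mm_shuffle_epi32 immediate selects a source 32-bit lane, so 0x55 replicates lane 1, while 0x44 and 0xee replicate the low and high 64-bit lane as pairs of 32-bit lanes. A minimal sketch of how such a control byte is built (the helper name is hypothetical, not from this codebase):

/* Pack four 2-bit lane selectors (result lanes 3..0) into a
   _mm_shuffle_epi32 control byte. */
#define SHUF32( d, c, b, a )  ( ((d)<<6) | ((c)<<4) | ((b)<<2) | (a) )
/* SHUF32(1,1,1,1) == 0x55   broadcast 32-bit lane 1
   SHUF32(1,0,1,0) == 0x44   broadcast 64-bit lane 0
   SHUF32(3,2,3,2) == 0xee   broadcast 64-bit lane 1 */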
@@ -347,8 +347,7 @@ static inline __m128i v128_neg1_fn()
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
static inline __m128i v128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
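
For context (an illustration, not part of the commit): _mm_ternarylogic_epi64 evaluates an 8-bit truth table indexed per bit position by (a<<2)|(b<<1)|c. With a = b = c = v only indices 0 and 7 occur, and imm8 = 1 outputs 1 only at index 0, i.e. ~v. A sanity check, assuming an AVX512VL-capable build:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
   /* imm8 0x01: output is 1 only for truth-table index 0 (a=b=c=0),
      so with all three operands equal to v this computes ~v. */
   __m128i v = _mm_set1_epi32( 0x0f0f0f0f );
   __m128i r = _mm_ternarylogic_epi64( v, v, v, 1 );
   printf( "%08x\n", (unsigned)_mm_cvtsi128_si32( r ) );   /* f0f0f0f0 */
   return 0;
}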
@@ -402,8 +401,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
+// ~v1 | v0
+#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
// a ^ b ^ c
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
@@ -434,6 +435,8 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#else
+#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
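
The immediates follow directly from the truth table: bit j of imm8 is the desired output for input index (a<<2)|(b<<1)|c, where a is the first operand. A self-contained sketch (helper names are hypothetical, not from this codebase) that derives the two constants used above:

#include <stdio.h>

/* Build a ternarylogic imm8 from any 3-input boolean function f(a,b,c). */
static unsigned char tl_imm8( int (*f)(int, int, int) )
{
   unsigned char imm = 0;
   for ( int i = 0; i < 8; i++ )
      imm |= (unsigned char)( f( (i>>2) & 1, (i>>1) & 1, i & 1 ) << i );
   return imm;
}

static int f_xor3 ( int a, int b, int c ) { return a ^ b ^ c; }
static int f_ornot( int a, int b, int c ) { (void)c; return (!a) | b; }

int main(void)
{
   printf( "xor3  = 0x%02x\n", tl_imm8( f_xor3 ) );    /* 0x96 */
   printf( "ornot = 0x%02x\n", tl_imm8( f_ornot ) );   /* 0xcf */
   return 0;
}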
@@ -454,7 +457,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
-#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
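
The hunk cuts off before the definitions, but the usual SSE2 route to the same mask extracts the sign bits with the floating-point movemask intrinsics. A sketch under that assumption (names follow the v128 convention; the actual definitions are outside this hunk):

// Sign bit of each 64/32-bit lane packed into an integer mask.
#define v128_movmask64( v ) _mm_movemask_pd( _mm_castsi128_pd( v ) )
#define v128_movmask32( v ) _mm_movemask_ps( _mm_castsi128_ps( v ) )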
@@ -494,7 +496,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_rol32_sse2( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
-#if defined(__AVX512VL__)
+#if defined(VL256)
// AVX512 fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
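
For targets without AVX512VL, a 64-bit rotate can be emulated the same way as v128_rol32_sse2 above; a minimal sketch (the name is by analogy with that convention, not a definition from this diff), valid for 0 < c < 64:

// Rotate each 64-bit lane right by c using only SSE2 shifts.
#define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )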
@@ -609,13 +611,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// deprecated
#define mm128_rol_32 v128_rol32
+// ror( v1 ^ v0, n )
+#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
+/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
@@ -917,10 +921,8 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
-// alignr instruction for 32 & 64 bit elements is only available with AVX512
-// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
@@ -929,6 +931,9 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
+#define v128_alignr8( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )
#define v128_alignr64( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )
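
Note the left shift takes the complementary count: alignr( hi, lo, c ) is the low 128 bits of hi:lo shifted right by c bytes, so hi contributes its low bytes shifted up by 16-c. A quick check that the emulation matches the SSSE3 intrinsic, assuming an SSSE3-capable build (-mssse3); a test sketch, not part of the commit:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
   __m128i lo = _mm_set_epi8( 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 );
   __m128i hi = _mm_set_epi8( 31,30,29,28,27,26,25,24,
                              23,22,21,20,19,18,17,16 );
   /* Both forms drop the low 5 bytes of hi:lo and keep the next 16. */
   __m128i a = _mm_alignr_epi8( hi, lo, 5 );
   __m128i b = _mm_or_si128( _mm_slli_si128( hi, 16-5 ),
                             _mm_srli_si128( lo, 5 ) );
   printf( "%s\n", memcmp( &a, &b, 16 ) ? "differ" : "match" );
   return 0;
}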
@@ -937,12 +942,15 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#endif
// blend using vector mask
#if defined(__SSE4_1__)
+// Bytewise using sign bit of each byte element of mask
#define v128_blendv _mm_blendv_epi8
#else
+// Bitwise
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
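
One caveat: _mm_blendv_epi8 keys off only the sign bit of each mask byte, while the bitwise fallback uses every bit of the mask, so the two agree when mask elements are all-zeros or all-ones, as vector compare results are. A small self-contained check of the bitwise form (SSE2 only; an illustration, not part of the commit):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
   __m128i v1   = _mm_set1_epi8( 0x11 );          /* taken where mask is 0 */
   __m128i v0   = _mm_set1_epi8( 0x22 );          /* taken where mask is 1 */
   __m128i mask = _mm_set_epi32( -1, 0, -1, 0 );  /* all-ones / all-zeros  */
   __m128i r    = _mm_or_si128( _mm_andnot_si128( mask, v1 ),
                                _mm_and_si128( mask, v0 ) );
   unsigned out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3] );
   /* 11111111 22222222 11111111 22222222 */
   return 0;
}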