Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions


@@ -1,7 +1,7 @@
#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1
#if defined(__SSE2__)
#if defined(__x86_64__) && defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////////
//
@@ -34,6 +34,109 @@
//
///////////////////////////////////////////////////////////////////////////////
// direct translation of native intrinsics
#define v128_t __m128i
#define v128_load _mm_load_si128
#define v128_store _mm_store_si128
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
#define v128_sub16 _mm_sub_epi16
#define v128_sub8 _mm_sub_epi8
// widen
#define v128_mul64 _mm_mul_epu64
#define v128_mul32 _mm_mul_epu32
#define v128_mul16 _mm_mul_epu16
// save low half
#define v128_mullo32 _mm_mullo_epi32
#define v128_mullo16 _mm_mullo_epi16
// compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
#define v128_cmpgt64 _mm_cmpgt_epi64
#define v128_cmpgt32 _mm_cmpgt_epi32
#define v128_cmpgt16 _mm_cmpgt_epi16
#define v128_cmplt64 _mm_cmplt_epi64
#define v128_cmplt32 _mm_cmplt_epi32
#define v128_cmplt16 _mm_cmplt_epi16
// bit shift
#define v128_sl64 _mm_slli_epi64
#define v128_sl32 _mm_slli_epi32
#define v128_sl16 _mm_slli_epi16
#define v128_sr64 _mm_srli_epi64
#define v128_sr32 _mm_srli_epi32
#define v128_sr16 _mm_srli_epi16
#define v128_sra64 _mm_srai_epi64
#define v128_sra32 _mm_srai_epi32
#define v128_sra16 _mm_srai_epi16
// logic
#define v128_or _mm_or_si128
#define v128_and _mm_and_si128
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_nor mm128_nor
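// Usage sketch (illustrative, not part of this commit): the SHA-2 style
// Ch(x,y,z) = (x & y) ^ (~x & z) expressed with the compound logic helpers
// above. Assumes <immintrin.h> and this header are in scope; the function
// name is a placeholder.
static inline __m128i example_ch( const __m128i x, const __m128i y,
                                  const __m128i z )
{
   return v128_xorandnot( v128_and( x, y ), x, z );
}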
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// The NEON version uses a vector mask rather than an immediate.
#if defined(__SSE4_1__)
#define v128_blend16 _mm_blend_epi16
#endif
#define v128_unpacklo64 _mm_unpacklo_epi64
#define v128_unpackhi64 _mm_unpackhi_epi64
#define v128_unpacklo32 _mm_unpacklo_epi32
#define v128_unpackhi32 _mm_unpackhi_epi32
#define v128_unpacklo16 _mm_unpacklo_epi16
#define v128_unpackhi16 _mm_unpackhi_epi16
#define v128_unpacklo8 _mm_unpacklo_epi8
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdeclast _mm_aesdeclast_si128
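// Usage sketch (illustrative, not part of this commit): one AES encryption
// round applied to a 128-bit state with a caller-supplied round key.
// Requires AES-NI; names are placeholders.
static inline __m128i example_aes_round( const __m128i state,
                                         const __m128i round_key )
{
   return v128_aesenc( state, round_key );
}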
// Used instead of casting.
typedef union
@@ -43,14 +146,22 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
#define mm128_64(i64) _mm_set1_epi64x(i64)
#define mm128_32(i32) _mm_set1_epi32(i32)
#define v128_32 mm128_32
#define v128_64 mm128_64
#define v128_set64 _mm_set_epi64x
#define v128_set_64 v128_set64 // deprecated
#define v128_set32 _mm_set_epi32
#define v128_set_32 v128_set32 // deprecated
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploilt new features to produce optimum code.
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
#define v128_mov64( u64 ) mm128_mov64_128( u64 )
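// Sketch of the alternatives the note above recommends (illustrative, not
// part of this commit): use the cvt or set1 intrinsics directly and let the
// compiler choose the best GP-to-SIMD move.
static inline __m128i example_gp_to_simd( const uint64_t n )
{
   return _mm_cvtsi64_si128( (int64_t)n );    // n in lane 0, upper lane zeroed
   // or _mm_set1_epi64x( (int64_t)n ) to broadcast n to both lanes
}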
static inline __m128i mm128_mov32_128( const uint32_t n )
{
@@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define v128_zero _mm_setzero_si128()
#define m128_zero v128_zero
#define m128_one_128 mm128_mov64_128( 1 )
// Inline asm avoids having to initialize the return variable just to silence a compiler warning.
@@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls )
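// Usage sketch (illustrative, not part of this commit): the same element copy
// done directly with the SSE4.1 insert_ps intrinsic, whose immediate uses the
// (i1<<4) | (i2<<6) layout built by the macro above. Lane values are arbitrary
// examples.
static inline __m128i example_mov32_32( void )
{
   const __m128i v1 = _mm_set_epi32( 13, 12, 11, 10 );   // dest, lanes 3..0
   const __m128i v2 = _mm_set_epi32( 23, 22, 21, 20 );   // src,  lanes 3..0
   // copy lane 2 of v2 (22) into lane 1 of v1: lanes 3..0 become 13,12,22,10
   return _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ),
                                           _mm_castsi128_ps( v2 ),
                                           (1<<4) | (2<<6) ) );
}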
#endif // SSE4_1
@@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
#define v128_not mm128_not
static inline __m128i mm128_negate_64( __m128i v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
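// Usage sketch (illustrative, not part of this commit): (a+b) and (c+d) can
// execute in parallel, unlike a strictly sequential a+b+c+d chain.
static inline __m128i example_sum4( const __m128i a, const __m128i b,
                                    const __m128i c, const __m128i d )
{
   return v128_add4_32( a, b, c, d );
}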
@@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v )
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
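// Usage sketch (illustrative, not part of this commit): indexing an aligned
// scalar buffer as 128-bit lanes. The buffer name and size are placeholders.
static inline void example_clear_first_lane( uint32_t state[16] )
{
   // state must be 16-byte aligned; lane 0 covers state[0..3]
   casti_v128( state, 0 ) = _mm_setzero_si128();
}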
@@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v )
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
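// Usage sketch (illustrative, not part of this commit): the counts are in
// 128-bit vectors, not bytes. Buffer names are placeholders.
static inline void example_copy_state( __m128i dst[8], const __m128i src[8] )
{
   v128_memset_zero( dst, 8 );    // zero 8 vectors (128 bytes)
   v128_memcpy( dst, src, 8 );    // copy 8 vectors (128 bytes)
}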
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
@@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_movmask_64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
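// Usage sketch (illustrative, not part of this commit): collapse a per-lane
// 32-bit compare result into a 4-bit scalar mask of the lane sign bits.
static inline int example_eq_mask( const __m128i a, const __m128i b )
{
   return v128_movmask32( v128_cmpeq32( a, b ) );   // bit i set if lane i equal
}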
//
// Bit rotations
@@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_ror_16 _mm_ror_epi16
#define mm128_rol_16 _mm_rol_epi16
#define mm128_rorx2_64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
@@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
@@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif // AVX512 else SSE2
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
#define v128_ror16 mm128_ror_16
#define v128_rol16 mm128_rol_16
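// Usage sketch (illustrative, not part of this commit): rotate every 32-bit
// lane right by 7 bits, the kind of operation SHA-2 style hashes perform.
// Works with either the AVX512VL rotate or the SSE2 shift/or fallback above.
static inline __m128i example_ror7( const __m128i v )
{
   return v128_ror32( v, 7 );
}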
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define v128_swap64 mm128_swap_64
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define v128_shuflr32 mm128_shuflr_32
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
#define v128_shufll32 mm128_shufll_32
#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev32( v ) mm128_rev_32( v )
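// Usage sketch (illustrative, not part of this commit): concrete lane movement
// of the 32-bit cross-lane rotates. Lane values are arbitrary examples.
static inline void example_lane_rotate( void )
{
   const __m128i v = _mm_set_epi32( 3, 2, 1, 0 );   // lanes 3..0 = 3,2,1,0
   __m128i r = v128_shuflr32( v );                  // lanes 3..0 = 0,3,2,1
   __m128i l = v128_shufll32( v );                  // lanes 3..0 = 2,1,0,3
   (void)r; (void)l;
}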
/* Not used
#if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#define v128_shuflr64_24 mm128_shuflr64_24
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#define v128_shuflr64_16 mm128_shuflr64_16
// Rotate 32 bit lanes
@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define v128_swap32_16 mm128_swap32_16
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
#define v128_shuflr32_8 mm128_shuflr32_8
//
// Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
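// Semantics sketch (illustrative, not part of this commit): like Intel's
// alignr, the emulated 64-bit form concatenates hi:lo and shifts right by
// whole elements. With SSSE3 the one-element case reduces to a byte alignr.
static inline __m128i example_alignr64_1( const __m128i hi, const __m128i lo )
{
   // result lane 0 = lo lane 1, result lane 1 = hi lane 0
   return _mm_alignr_epi8( hi, lo, 8 );
}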