Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions


@@ -1,7 +1,7 @@
#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1
#if defined(__SSE2__)
#if defined(__x86_64__) && defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////////
//
@@ -34,6 +34,109 @@
//
///////////////////////////////////////////////////////////////////////////////
// direct translation of native intrinsics
#define v128_t __m128i
#define v128_load _mm_load_si128
#define v128_store _mm_store_si128
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
#define v128_sub16 _mm_sub_epi16
#define v128_sub8 _mm_sub_epi8
// widen
#define v128_mul64 _mm_mul_epu64
#define v128_mul32 _mm_mul_epu32
#define v128_mul16 _mm_mul_epu16
// save low half
#define v128_mullo32 _mm_mullo_epi32
#define v128_mullo16 _mm_mullo_epi16
// compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
#define v128_cmpgt64 _mm_cmpgt_epi64
#define v128_cmpgt32 _mm_cmpgt_epi32
#define v128_cmpgt16 _mm_cmpgt_epi16
#define v128_cmplt64 _mm_cmplt_epi64
#define v128_cmplt32 _mm_cmplt_epi32
#define v128_cmplt16 _mm_cmplt_epi16
// bit shift
#define v128_sl64 _mm_slli_epi64
#define v128_sl32 _mm_slli_epi32
#define v128_sl16 _mm_slli_epi16
#define v128_sr64 _mm_srli_epi64
#define v128_sr32 _mm_srli_epi32
#define v128_sr16 _mm_srli_epi16
#define v128_sra64 _mm_srai_epi64
#define v128_sra32 _mm_srai_epi32
#define v128_sra16 _mm_srai_epi16
// logic
#define v128_or _mm_or_si128
#define v128_and _mm_and_si128
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_nor mm128_nor
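// Usage sketch (illustrative, not part of this commit): the SHA-2 style
// Ch(x,y,z) = (x & y) ^ (~x & z) expressed with the compound logic helpers
// above. Assumes <immintrin.h> and this header are in scope; the function
// name is a placeholder.
static inline __m128i example_ch( const __m128i x, const __m128i y,
                                  const __m128i z )
{
   return v128_xorandnot( v128_and( x, y ), x, z );
}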
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// The NEON version uses a vector mask rather than an immediate.
#if defined(__SSE4_1__)
#define v128_blend16 _mm_blend_epi16
#endif
#define v128_unpacklo64 _mm_unpacklo_epi64
#define v128_unpackhi64 _mm_unpackhi_epi64
#define v128_unpacklo32 _mm_unpacklo_epi32
#define v128_unpackhi32 _mm_unpackhi_epi32
#define v128_unpacklo16 _mm_unpacklo_epi16
#define v128_unpackhi16 _mm_unpackhi_epi16
#define v128_unpacklo8 _mm_unpacklo_epi8
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdeclast _mm_aesdeclast_si128
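// Usage sketch (illustrative, not part of this commit): one AES encryption
// round applied to a 128-bit state with a caller-supplied round key.
// Requires AES-NI; names are placeholders.
static inline __m128i example_aes_round( const __m128i state,
                                         const __m128i round_key )
{
   return v128_aesenc( state, round_key );
}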
// Used instead of casting.
typedef union
@@ -43,14 +146,22 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
#define mm128_64(i64) _mm_set1_epi64x(i64)
#define mm128_32(i32) _mm_set1_epi32(i32)
#define v128_32 mm128_32
#define v128_64 mm128_64
#define v128_set64 _mm_set_epi64x
#define v128_set_64 v128_set64 // deprecated
#define v128_set32 _mm_set_epi32
#define v128_set_32 v128_set32 // deprecated
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploilt new features to produce optimum code.
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
#define v128_mov64( u64 ) mm128_mov64_128( u64 )
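// Sketch of the alternatives the note above recommends (illustrative, not
// part of this commit): use the cvt or set1 intrinsics directly and let the
// compiler choose the best GP-to-SIMD move.
static inline __m128i example_gp_to_simd( const uint64_t n )
{
   return _mm_cvtsi64_si128( (int64_t)n );    // n in lane 0, upper lane zeroed
   // or _mm_set1_epi64x( (int64_t)n ) to broadcast n to both lanes
}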
static inline __m128i mm128_mov32_128( const uint32_t n )
{
@@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define v128_zero _mm_setzero_si128()
#define m128_zero v128_zero
#define m128_one_128 mm128_mov64_128( 1 )
// Inline asm avoids having to initialize the return variable just to silence a compiler warning.
@@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls )
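// Usage sketch (illustrative, not part of this commit): the same element copy
// done directly with the SSE4.1 insert_ps intrinsic, whose immediate uses the
// (i1<<4) | (i2<<6) layout built by the macro above. Lane values are arbitrary
// examples.
static inline __m128i example_mov32_32( void )
{
   const __m128i v1 = _mm_set_epi32( 13, 12, 11, 10 );   // dest, lanes 3..0
   const __m128i v2 = _mm_set_epi32( 23, 22, 21, 20 );   // src,  lanes 3..0
   // copy lane 2 of v2 (22) into lane 1 of v1: lanes 3..0 become 13,12,22,10
   return _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ),
                                           _mm_castsi128_ps( v2 ),
                                           (1<<4) | (2<<6) ) );
}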
#endif // SSE4_1
@@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
#define v128_not mm128_not
static inline __m128i mm128_negate_64( __m128i v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
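// Usage sketch (illustrative, not part of this commit): (a+b) and (c+d) can
// execute in parallel, unlike a strictly sequential a+b+c+d chain.
static inline __m128i example_sum4( const __m128i a, const __m128i b,
                                    const __m128i c, const __m128i d )
{
   return v128_add4_32( a, b, c, d );
}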
@@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v )
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
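// Usage sketch (illustrative, not part of this commit): indexing an aligned
// scalar buffer as 128-bit lanes. The buffer name and size are placeholders.
static inline void example_clear_first_lane( uint32_t state[16] )
{
   // state must be 16-byte aligned; lane 0 covers state[0..3]
   casti_v128( state, 0 ) = _mm_setzero_si128();
}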
@@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v )
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
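// Usage sketch (illustrative, not part of this commit): the counts are in
// 128-bit vectors, not bytes. Buffer names are placeholders.
static inline void example_copy_state( __m128i dst[8], const __m128i src[8] )
{
   v128_memset_zero( dst, 8 );    // zero 8 vectors (128 bytes)
   v128_memcpy( dst, src, 8 );    // copy 8 vectors (128 bytes)
}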
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
@@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_movmask_64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
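// Usage sketch (illustrative, not part of this commit): collapse a per-lane
// 32-bit compare result into a 4-bit scalar mask of the lane sign bits.
static inline int example_eq_mask( const __m128i a, const __m128i b )
{
   return v128_movmask32( v128_cmpeq32( a, b ) );   // bit i set if lane i equal
}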
//
// Bit rotations
@@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_ror_16 _mm_ror_epi16
#define mm128_rol_16 _mm_rol_epi16
#define mm128_rorx2_64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
@@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
@@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif // AVX512 else SSE2
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
#define v128_ror16 mm128_ror_16
#define v128_rol16 mm128_rol_16
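// Usage sketch (illustrative, not part of this commit): rotate every 32-bit
// lane right by 7 bits, the kind of operation SHA-2 style hashes perform.
// Works with either the AVX512VL rotate or the SSE2 shift/or fallback above.
static inline __m128i example_ror7( const __m128i v )
{
   return v128_ror32( v, 7 );
}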
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define v128_swap64 mm128_swap_64
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define v128_shuflr32 mm128_shuflr_32
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
#define v128_shufll32 mm128_shufll_32
#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev32( v ) mm128_rev_32( v )
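// Usage sketch (illustrative, not part of this commit): concrete lane movement
// of the 32-bit cross-lane rotates. Lane values are arbitrary examples.
static inline void example_lane_rotate( void )
{
   const __m128i v = _mm_set_epi32( 3, 2, 1, 0 );   // lanes 3..0 = 3,2,1,0
   __m128i r = v128_shuflr32( v );                  // lanes 3..0 = 0,3,2,1
   __m128i l = v128_shufll32( v );                  // lanes 3..0 = 2,1,0,3
   (void)r; (void)l;
}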
/* Not used
#if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#define v128_shuflr64_24 mm128_shuflr64_24
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#define v128_shuflr64_16 mm128_shuflr64_16
// Rotate 32 bit lanes
@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define v128_swap32_16 mm128_swap32_16
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
#define v128_shuflr32_8 mm128_shuflr32_8
//
// Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
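// Semantics sketch (illustrative, not part of this commit): like Intel's
// alignr, the emulated 64-bit form concatenates hi:lo and shifts right by
// whole elements. With SSSE3 the one-element case reduces to a byte alignr.
static inline __m128i example_alignr64_1( const __m128i hi, const __m128i lo )
{
   // result lane 0 = lo lane 1, result lane 1 = hi lane 0
   return _mm_alignr_epi8( hi, lo, 8 );
}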