v24.2

2026-02-22 16:33:08 +00:00 · 2024-05-20 23:08:50 -04:00
parent 4f930574cc
commit 042d13d1e1
129 changed files with 835 additions and 538 deletions
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -66,8 +66,7 @@ typedef union

 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)

 #define mm256_bcast128lo_64( i64 )     _mm256_maskz_set1_epi64( 0x55, i64 )
 #define mm256_bcast128hi_64( i64 )     _mm256_maskz_set1_epi64( 0xaa, i64 )
@@ -117,8 +116,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Basic operations without SIMD equivalent

-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)

 static inline __m256i mm256_not( const __m256i v )
 {  return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -137,8 +135,10 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_add4_32( a, b, c, d ) \
   _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )

-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
+
+// ~v1 | v0  (imm8 0xcf with A = v1, B = C = v0 evaluates to ~A | B)
+#define mm256_ornot( v1, v0 )      _mm256_ternarylogic_epi64( v1, v0, v0, 0xcf )

 // a ^ b ^ c
 #define mm256_xor3( a, b, c )      _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -172,6 +172,8 @@ static inline __m256i mm256_not( const __m256i v )
    
 #else

+#define mm256_ornot( v1, v0 )      _mm256_or_si256( mm256_not( v1 ), v0 )  // ~v1 | v0, same as VL256 variant
+
 #define mm256_xor3( a, b, c ) \
  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )

@@ -257,7 +259,7 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -343,8 +345,7 @@ static inline __m256i mm256_not( const __m256i v )
 // optimization for AVX2, does nothing for AVX512 but is here for
 // transparency.

-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 /*
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -470,7 +471,7 @@ static inline __m256i mm256_not( const __m256i v )

 /* Not used
 // Rotate 256 bit vector by one 32 bit element.
-#if defined(__AVX512VL__)
+#if defined(VL256)
 static inline __m256i mm256_shuflr_32( const __m256i v )
 { return _mm256_alignr_epi32( v, v, 1 ); }
 static inline __m256i mm256_shufll_32( const __m256i v )
@@ -507,8 +508,8 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #define mm256_shuflr128_32(v)   _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_shufll128_32(v)   _mm256_shuffle_epi32( v, 0x93 )

-#define mm256_shuflr128_16(v)   _mm256_shuffle_epi16( v, 0x39 )
-#define mm256_shufll128_16(v)   _mm256_shuffle_epi16( v, 0x93 )
+#define mm256_shuflr128_16(v)   mm256_shuffle_16( v, 0x39 )
+#define mm256_shufll128_16(v)   mm256_shuffle_16( v, 0x93 )

 /* Not used
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
@@ -606,6 +607,22 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
 }

+#if defined(VL256)
+
+#define mm256_alignr64      _mm256_alignr_epi64
+
+#else
+// AVX2 stand-in for _mm256_alignr_epi64; only the low 2 bits of c are used.
+#define mm256_alignr64( v1, v0, c ) \
+  ( ( ( (c) & 3 ) == 1 ) ? _mm256_blend_epi32( mm256_shuflr_64( v1 ), \
+                                               mm256_shuflr_64( v0 ), 0x3f ) \
+  : ( ( (c) & 3 ) == 2 ) ? _mm256_blend_epi32( mm256_rev_128( v1 ), \
+                                               mm256_rev_128( v0 ), 0x0f ) \
+  : ( ( (c) & 3 ) == 3 ) ? _mm256_blend_epi32( mm256_shufll_64( v1 ), \
+                                               mm256_shufll_64( v0 ), 0x03 ) \
+  : ( v0 ) )
+
+#endif

 #endif // __AVX2__
 #endif // SIMD_256_H__