Jay D Dee
2022-10-21 23:12:18 -04:00
parent 58030e2788
commit bd84f199fe
35 changed files with 983 additions and 4938 deletions


@@ -1,30 +1,28 @@
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
//#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
//                 AVX2 256 bit vectors
//
// Basic support for 256 bit vectors is available with AVX but integer
// support requires AVX2.
//
// AVX512VL backports some AVX512 features to 256 bit vectors and can produce
// more efficient implementations of some functions. They will be selected
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary. Some usage may have
// the index vector encoded as if full vector shuffles are supported. This has
// no side effects and would give the same results with either version.
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
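// For illustration, a minimal sketch of the intrinsic form ( a hypothetical
// helper, assuming AVX512BW & AVX512VL are both available ). With intrinsics
// the mask is an explicit value rather than a register name: a literal 0 mask
// would zero every byte, so an all ones mask is used to leave the result
// unmasked.
#if defined(__AVX512VL__) && defined(__AVX512BW__)
static inline __m256i mm256_maskz_shuffle_sketch( const __m256i v,
                                                  const __m256i c )
{  return _mm256_maskz_shuffle_epi8( (__mmask32)0xffffffff, v, c );  }
#endif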
#if defined(__AVX__)
@@ -57,8 +55,8 @@ typedef union
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
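// casto_m256i: p is treated as an array of __m256i and indexed by o, i.e. the
// offset is in 32 byte vector units rather than bytes.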
#endif
#if defined(__AVX2__)
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
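// For reference, a sketch of the same operation without the 128 bit helper,
// assuming mm128_mov64_128 wraps _mm_cvtsi64_si128 ( x86_64 only ):
static inline __m256i mm256_mov64_256_sketch( const long long i )
{  return _mm256_castsi128_si256( _mm_cvtsi64_si128( i ) );  }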
@@ -72,7 +70,6 @@ typedef union
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// Concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
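// Usage note: hi ends up in bits 255:128 and lo in bits 127:0. On compilers
// that provide it, _mm256_set_m128i( hi, lo ) is an equivalent one liner.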
@@ -145,8 +142,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Basic operations without SIMD equivalent
// Bitwise not ( ~v )
#if defined(__AVX512VL__)
static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm256_not( v )       _mm256_xor_si256( v, m256_neg1 )
#endif
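// Why immediate 1 gives NOT: _mm256_ternarylogic_epi64 looks up each result
// bit in an 8 bit truth table indexed by ( a<<2 | b<<1 | c ). With all three
// operands equal to v the index is only ever 0b000 or 0b111, and truth table
// 0x01 maps 0b000 -> 1 and 0b111 -> 0, i.e. ~v, without loading the all ones
// constant m256_neg1 that the AVX2 path needs.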
// Unary negation of each element ( -v )
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
@@ -281,6 +287,50 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_blend_epi32( v3, v2, 0x44) \
_mm256_blend_epi32( v1, v0, 0x11 ) )
/*
//
// Extended bit shift for concatenated packed elements from 2 vectors.
// Shift right returns the low half, shift left returns the high half.
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
#define mm256_shl2_64( v1, v2, c ) _mm256_shldi_epi64( v1, v2, c )
#define mm256_shr2_64( v1, v2, c ) _mm256_shrdi_epi64( v1, v2, c )
#define mm256_shl2_32( v1, v2, c ) _mm256_shldi_epi32( v1, v2, c )
#define mm256_shr2_32( v1, v2, c ) _mm256_shrdi_epi32( v1, v2, c )
#define mm256_shl2_16( v1, v2, c ) _mm256_shldi_epi16( v1, v2, c )
#define mm256_shr2_16( v1, v2, c ) _mm256_shrdi_epi16( v1, v2, c )
#else
#define mm256_shl2_64( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi64( v1, c ), \
_mm256_srli_epi64( v2, 64 - (c) ) )
#define mm256_shr2_64( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi64( v2, c ), \
_mm256_slli_epi64( v1, 64 - (c) ) )
#define mm256_shl2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi32( v1, c ), \
_mm256_srli_epi32( v2, 32 - (c) ) )
#define mm256_shr2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi32( v2, c ), \
_mm256_slli_epi32( v1, 32 - (c) ) )
#define mm256_shl2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi16( v1, c ), \
_mm256_srli_epi16( v2, 16 - (c) ) )
#define mm256_shr2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi16( v2, c ), \
_mm256_slli_epi16( v1, 16 - (c) ) )
#endif
*/
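// The disabled macros above are 2 input ( funnel ) shifts: each element of the
// result is a window into the concatenation v1:v2 shifted by c, with shift
// right keeping the low half and shift left the high half. A scalar sketch of
// a single 64 bit element, for illustration only ( assumes 0 < c < 64 ):
static inline unsigned long long shr2_64_scalar_sketch(
      const unsigned long long hi, const unsigned long long lo, const int c )
{  return ( lo >> c ) | ( hi << ( 64 - c ) );  }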
//
// Bit rotations.
@@ -414,13 +464,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate elements within each 128 bit lane of 256 bit vector.
// Limited 2 input shuffle.
#define mm256_shuffle2_64( v1, v2, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
                                           _mm256_castsi256_pd( v2 ), c ) )

#define mm256_shuffle2_32( v1, v2, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) )
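// Usage note: the double / float casts only exist to reuse vshufpd / vshufps
// as 2 input integer shuffles. Within each 128 bit lane the lower result
// elements are selected from v1 and the upper ones from v2, using the
// corresponding bits ( 64 bit version ) or 2 bit fields ( 32 bit version ) of
// the immediate c.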
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
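// 0x4e is _MM_SHUFFLE( 1, 0, 3, 2 ): it swaps the two 64 bit halves of each
// 128 bit lane, so rotating the two elements left or right gives the same
// result, hence the mm256_shuflr128_64 alias.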