Jay D Dee
2022-07-10 11:04:00 -04:00
parent 26b8927632
commit f552f2b1e8
27 changed files with 883 additions and 396 deletions


@@ -2654,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
d[10] = s0[5]; d[11] = s1[5];
d[12] = s0[6]; d[13] = s1[6];
d[14] = s0[7]; d[15] = s1[7];
if ( bit_len <= 1024 ) return;
d[16] = s0[8]; d[17] = s1[8];
d[18] = s0[9]; d[19] = s1[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_2x128_512( void *dst, const void *src0,
@@ -2721,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
d[20] = s0[5]; d[21] = s1[5]; d[22] = s2[5]; d[23] = s3[5];
d[24] = s0[6]; d[25] = s1[6]; d[26] = s2[6]; d[27] = s3[6];
d[28] = s0[7]; d[29] = s1[7]; d[30] = s2[7]; d[31] = s3[7];
if ( bit_len <= 1024 ) return;
d[32] = s0[8]; d[33] = s1[8]; d[34] = s2[8]; d[35] = s3[8];
d[36] = s0[9]; d[37] = s1[9]; d[38] = s2[9]; d[39] = s3[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_4x128_512( void *dst, const void *src0,

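For reference, a hedged standalone sketch of the pattern these unrolled hunks extend; the loop form and the name intrlv_2x128_sketch are illustrative assumptions, not code from this commit. Indices are in 128 bit units, matching the d[] and s0[]/s1[] indices above.

#include <immintrin.h>

// Hedged sketch: interleave two sources at 128 bit granularity, as
// intrlv_2x128 does above with its loop fully unrolled. bit_len gates
// how many elements are copied, like the early returns in the real code.
static inline void intrlv_2x128_sketch( __m128i *d, const __m128i *s0,
                                        const __m128i *s1, const int bit_len )
{
   const int nelem = bit_len / 128;      // 128 bit elements per source
   for ( int i = 0; i < nelem; i++ )
   {
      d[ 2*i ]     = s0[ i ];            // even slots from src0
      d[ 2*i + 1 ] = s1[ i ];            // odd slots from src1
   }
}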

@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Limited 2 input shuffle
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
_mm_castsi128_pd( b ), c ) );

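A hedged usage sketch of mm128_shuffle2_64; the helper name is an assumption for the example. With a = { a0, a1 } and b = { b0, b1 }, control 1 selects a's high element for the result's low half and b's low element for the high half, yielding { a1, b0 }.

#include <immintrin.h>

// Hedged sketch: the dst low half always comes from a, the high half from
// b; the control bits only pick which element of each source. Control 1
// takes a's high element and b's low element: { a1, b0 }.
static inline __m128i shuffle2_64_demo( const __m128i a, const __m128i b )
{
   return _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ),
                                            _mm_castsi128_pd( b ), 1 ) );
}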

@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports a full vector 8 bit shuffle. The AVX512VL
// + AVX512BW intrinsic _mm256_mask_shuffle_epi8, with a null mask, can be
// used if a shuffle that crosses 128 bit lanes is needed. BSWAP doesn't
// cross lanes, therefore the AVX2 version will work here. The bswap control
// vector is coded to work with both versions; bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \

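A hedged, self-contained AVX2 sketch of the bswap described above; the function name is an assumption. The two high lane control qwords have bit 4 set (byte indices 0x10..0x1f), which AVX2 ignores, so the same constants would also drive an AVX512 full vector byte shuffle.

#include <immintrin.h>

// Hedged sketch: reverse byte order within each 64 bit element. AVX2 uses
// only the low 4 bits of each control byte to index within its 128 bit
// lane, so bit 4 in the high lane constants is ignored, exactly as the
// comment above describes.
static inline __m256i bswap_64_sketch( const __m256i v )
{
   const __m256i ctl = _mm256_set_epi64x( 0x18191a1b1c1d1e1f,
                                          0x1011121314151617,
                                          0x08090a0b0c0d0e0f,
                                          0x0001020304050607 );
   return _mm256_shuffle_epi8( v, ctl );
}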

@@ -318,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly, but the fixed versions only accept
// an immediate 8 bit value for the control arg.
// The workaround is fragile, just a fluke of the compiler's optimizer,
// and fails without -O3. The compiler seems to unroll shift loops,
// eliminating the variable control, better than it does rotate loops.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
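A hedged sketch of the shift-based workaround the note refers to; the name is illustrative, not repo code. Two shifts and an OR emulate a rotate when the count isn't a compile-time constant; with -O3 the optimizer can often reduce it to the native rotate instruction.

#include <immintrin.h>

// Hedged sketch, assumes 0 < c < 64: the shift intrinsics accept a
// variable count (the register-count shift forms exist), so this compiles
// even when c isn't an immediate, unlike _mm512_rol_epi64.
static inline __m512i rol64_sketch( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_slli_epi64( v, c ),
                           _mm512_srli_epi64( v, 64 - c ) );
}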
@@ -430,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// Rename plan: change ror to vror for Vector ROtate Right, and vrol for
// Vector ROtate Left, not to be confused with variable rotate rorv, rolv.
// Plan changed: use shuflr & shufll instead, symbolizing a shuffle-rotate
// operation. The 1xNN notation is also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//
#define mm512_shiftr_256( v ) \
   _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 )
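To illustrate the combining rule just above (hedged sketch, hypothetical name): two 128 bit right shifts collapse into a single 256 bit shift, since alignr against a zero vector is a zero-filling shift and the counts simply add.

#include <immintrin.h>

// Hedged sketch: shiftr_128 applied twice equals shiftr_256, because
// _mm512_alignr_epi64 against zero shifts right with zero fill.
static inline __m512i shiftr_128_twice( const __m512i v )
{
   const __m512i z = _mm512_setzero_si512();
   return _mm512_alignr_epi64( z, _mm512_alignr_epi64( z, v, 2 ), 2 );
}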
@@ -530,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
@@ -584,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
// Limited 2 input, 1 output shuffle within 128 bit lanes.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes, and like some shuffles
// the destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
_mm512_castsi512_pd( b ), c ) );
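A hedged usage sketch of mm512_shuffle2_64; the name and control value are assumptions for the example. Per 128 bit lane the result's low element must come from a and the high element from b; the control only selects within each source.

#include <immintrin.h>

// Hedged sketch: in each 128 bit lane, dst.lo is selected from a and
// dst.hi from b. Control 0x55 sets the a-select bits, so every lane
// becomes { a.hi, b.lo }.
static inline __m512i shuffle2_64_demo512( const __m512i a, const __m512i b )
{
   return _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ),
                                                  _mm512_castsi512_pd( b ),
                                                  0x55 ) );
}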
@@ -621,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// Drop macros? They can easily be rebuilt using shufl2 functions.
// 2 input, 1 output
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
// the rotated v1.
// Visually confusing for shufl2r because of arg order: the first arg is
// always the target for modification, either updated by reference or by
// function return.
// Rotate concatenated { v1, v2 } right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
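Finally, a hedged sketch of the arg-order point above; the demo name is an assumption. mm512_shufl2r_256( v1, v2 ) returns the low 512 bits of the 1024 bit value ( v2 : v1 ) shifted right 256 bits, i.e. v1 rotated with v2 supplying the incoming bits.

#include <immintrin.h>

// Hedged sketch: v1 is the target, v2 supplies the bits shifted in.
// alignr( v2, v1, 4 ) drops v1's low 4 qwords and pulls in v2's low 4.
static inline __m512i shufl2r_256_demo( const __m512i v1, const __m512i v2 )
{
   return _mm512_alignr_epi64( v2, v1, 4 );
}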