mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.11.0
This commit is contained in:
@@ -2283,7 +2283,46 @@ static inline void rintrlv_8x32_8x64( void *dst,
|
||||
d[63] = _mm_unpackhi_epi32( s[61], s[63] );
|
||||
}
|
||||
|
||||
// 8x32 -> 4x128
|
||||
|
||||
// 16 bytes per lane
|
||||
// Re-interleave one 16-byte chunk per 128-bit destination lane:
// gathers 8-way 32-bit interleaved data into two 4-way 128-bit groups.
// Mapping: d0[4c+r] = s[8r+c], d1[4c+r] = s[8r+c+4] for r,c in 0..3.
#define RLEAVE_8X32_4X128( i ) \
do { \
   uint32_t *d0_ = (uint32_t*)dst0 + (i); \
   uint32_t *d1_ = (uint32_t*)dst1 + (i); \
   const uint32_t *s_ = (const uint32_t*)src + ((i)<<1); \
   for ( int r_ = 0; r_ < 4; r_++ ) \
      for ( int c_ = 0; c_ < 4; c_++ ) \
      { \
         d0_[ 4*c_ + r_ ] = s_[ 8*r_ + c_     ]; \
         d1_[ 4*c_ + r_ ] = s_[ 8*r_ + c_ + 4 ]; \
      } \
} while(0)
|
||||
|
||||
// Convert 8-way 32-bit interleaved data (src) into two 4-way 128-bit
// interleaved groups (dst0, dst1). bit_len selects how many bits per
// lane are converted: 256, 512, or the full 1024.
static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,
                                     const void *src, const int bit_len )
{
   RLEAVE_8X32_4X128(   0 );   RLEAVE_8X32_4X128(  16 );
   if ( bit_len > 256 )
   {
      RLEAVE_8X32_4X128( 32 );   RLEAVE_8X32_4X128( 48 );
      if ( bit_len > 512 )
      {
         RLEAVE_8X32_4X128( 64 );   RLEAVE_8X32_4X128( 80 );
         RLEAVE_8X32_4X128( 96 );   RLEAVE_8X32_4X128( 112 );
      }
   }
}
|
||||
#undef RLEAVE_8X32_4X128
|
||||
|
||||
/*
|
||||
#define RLEAVE_4x32_4x64(i) do \
|
||||
|
||||
@@ -42,17 +42,18 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
|
||||
return mm256_concat_128( hi, lo );
|
||||
}
|
||||
|
||||
// Broadcast 128 bits in pairs of 64 bit integer constants { i1, i0 } to all
// 128 bit lanes.
// NOTE(review): this file later redefines m256_const2_64 with a different
// expansion; this older version should be removed (or #undef'd before the
// new definition) to avoid a conflicting-redefinition error — confirm
// against the full file.
#define m256_const2_64( i1, i0 ) \
   _mm256_permute4x64_epi64( _mm256_castsi128_si256( \
                             m128_const_64( i1, i0 ) ), 0x44 )
|
||||
|
||||
// Equivalent of set1: broadcast an integer constant to all elements.
// Fix: "m256_const1_8 ( i )" had a space before the parameter list, which
// made it an OBJECT-like macro whose expansion begins with "( i )" — any
// use of m256_const1_8( x ) would expand to "( i ) _mm256_broadcastb_epi8
// ( mm128_mov32_128( i ) )( x )" and fail to compile. Duplicate
// definitions (diff residue) are also collapsed to a single set.
#define m256_const1_64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8( i )   _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )

// Broadcast a 128-bit vector to both 128-bit lanes.
#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v )
|
||||
|
||||
// Defensive: an earlier (superseded) definition of m256_const2_64 may
// still exist in this file; drop it before redefining.
#undef m256_const2_64

// Broadcast a 128-bit constant assembled from two 64-bit integer
// constants { i1, i0 } to both 128-bit lanes.
#define m256_const2_64( i1, i0 ) \
   m256_const1_128( m128_const_64( i1, i0 ) )

// Broadcast a 64-bit constant assembled from two 32-bit values { i1, i0 }
// to all 64-bit elements.
#define m256_const2_32( i1, i0 ) \
   m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

// Fix: the original name contained a typo ("m126"); kept as an alias so
// existing callers of the typo'ed name still compile.
#define m126_const2_32( i1, i0 ) m256_const2_32( i1, i0 )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -38,6 +38,36 @@
|
||||
// shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||
// where shuffle_epi8 spans the entire vector.
|
||||
//
|
||||
// There are 2 areas where overhead is a concern: constants and
|
||||
// permutations.
|
||||
//
|
||||
// Constants need to be composed at run time by assembling individual
|
||||
// elements, very expensive. The cost is proportional to the number of
|
||||
// elements, therefore use the largest element size possible, even by
|
||||
// merging smaller values.
|
||||
//
|
||||
// Constants with repeating patterns can be optimized with the smaller
|
||||
// patterns repeated more frequently being more efficient.
|
||||
//
|
||||
// Some specific constants can be very efficient. Zero is very efficient,
|
||||
// 1 and -1 slightly less so.
|
||||
//
|
||||
// If an expensive constant is to be reused in the same function it should
|
||||
// be declared as a local variable defined once and reused.
|
||||
//
|
||||
// Permutations can be very expensive if they use a vector control index,
|
||||
// even if the permutation itself is quite efficient.
|
||||
// The index is essentially a constant with all the baggage that brings.
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smaller vector can use an imm control.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instructions
|
||||
// can be used with imm control instead of permute.
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
@@ -106,12 +136,14 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
// Equivalent of set1: broadcast a small integer constant to all 16-bit /
// 8-bit elements of a 512-bit vector.
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i )  _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
// Broadcast the 128-bit pair { v1, v0 } to all 256-bit lanes.
// Fix: the expansion referenced undefined names "lo" and "hi" instead of
// the macro's actual parameters v0 and v1, so any use failed to compile.
#define m512_const2_128( v1, v0 ) \
   m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
|
||||
|
||||
// Broadcast a 128-bit constant assembled from two 64-bit integer
// constants { i1, i0 } to all 128-bit lanes.
#define m512_const2_64( i1, i0 ) \
   m512_const1_128( m128_const_64( i1, i0 ) )
|
||||
|
||||
// Broadcast a 64-bit constant assembled from two 32-bit values { i1, i0 }
// to all 64-bit elements of a 512-bit vector.
// Fix: diff residue left a second, dangling copy of the expansion after
// the macro, which would be a stray file-scope expression; collapsed to a
// single clean definition (the redundant inner parentheses are dropped).
#define m512_const2_32( i1, i0 ) \
   m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )
|
||||
|
||||
// { m128_1, m128_1, m128_0, m128_0 }
|
||||
#define m512_const_2x128( v1, v0 ) \
|
||||
|
||||
Reference in New Issue
Block a user