v3.9.6

2025-09-17 23:44:27 +00:00 · 2019-07-17 17:54:38 -04:00
parent e2d5762ef2
commit 6f49ba09b7
34 changed files with 1930 additions and 382 deletions
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -477,42 +477,42 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
   __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
   __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
   __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
-  const __m256i zero = m256_zero;
-  const __m256i one  = m256_one_32;
-  const __m256i two  = _mm256_add_epi32( one, one );
-  const __m256i tre  = _mm256_add_epi32( two, one );
-  const __m256i four = _mm256_add_epi32( two, two );
+  const __m256i zero  = m256_zero;
+  const __m256i one   = m256_one_32;
+  const __m256i two   = _mm256_add_epi32( one, one );
+  const __m256i three = _mm256_add_epi32( two, one );
+  const __m256i four  = _mm256_add_epi32( two, two );

-  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
-  casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one  );
-  casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two  );
-  casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, tre  );
-  casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four );
+  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero  );
+  casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one   );
+  casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two   );
+  casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
+  casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four  );
  casti_m256i( d, 5 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, one ) );
+                                       _mm256_add_epi32( four, one   ) );
  casti_m256i( d, 6 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, two ) );
+                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, tre ) );
-  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
-  casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one  );
-  casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two  );
-  casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, tre  );
-  casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four );
+                                       _mm256_add_epi32( four, three ) );
+  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero  );
+  casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one   );
+  casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two   );
+  casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
+  casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four  );
  casti_m256i( d,13 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, one ) );
+                                       _mm256_add_epi32( four, one   ) );
  casti_m256i( d,14 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, two  ) );
+                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, tre ) );
+                                       _mm256_add_epi32( four, three ) );
  casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), zero );
+                             _mm256_castsi128_si256( s2 ), zero  );
  casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), one  );
+                             _mm256_castsi128_si256( s2 ), one   );
  casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), two  );
+                             _mm256_castsi128_si256( s2 ), two   );
  casti_m256i( d,19 ) = _mm256_permutevar8x32_epi32( 
-                             _mm256_castsi128_si256( s2 ), tre  );
+                             _mm256_castsi128_si256( s2 ), three );
 }

 #endif   // AVX2
@@ -677,39 +677,39 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
 {
  __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) );
  __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) );
-  const __m512i zero   = m512_zero;
-  const __m512i one    = m512_one_32;
-  const __m512i two    = _mm512_add_epi32( one,   one  );
-  const __m512i tre    = _mm512_add_epi32( two,   one  );
-  const __m512i four   = _mm512_add_epi32( two,   two  );
-  const __m512i eight  = _mm512_add_epi32( four,  four );
-  const __m512i eleven = _mm512_add_epi32( eight, tre  );
+  const __m512i zero     = m512_zero;
+  const __m512i one      = m512_one_32;
+  const __m512i two      = _mm512_add_epi32( one,   one   );
+  const __m512i three    = _mm512_add_epi32( two,   one   );
+  const __m512i four     = _mm512_add_epi32( two,   two   );
+  const __m512i eight    = _mm512_add_epi32( four,  four  );
+  const __m512i eleven   = _mm512_add_epi32( eight, three );

-  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero );
-  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one  );
-  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two  );
-  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, tre  );
-  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four );
+  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero   );
+  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one    );
+  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two    );
+  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three  );
+  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four   );
  casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four, one ) );
+                                    _mm512_add_epi32( four,   one   ) );
  casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( four, two ) );
+                                    _mm512_add_epi32( four,   two   ) );
  casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four, tre ) );
+                                    _mm512_add_epi32( four,   three ) );
  casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight );
  casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eight, one ) );
+                                    _mm512_add_epi32( eight,  one   ) );
  casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eight, two ) );
+                                    _mm512_add_epi32( eight,  two   ) );
  casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven ); 
  casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, one ) );
+                                    _mm512_add_epi32( eleven, one   ) );
  casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, two ) );
+                                    _mm512_add_epi32( eleven, two   ) );
  casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, tre ) );
+                                    _mm512_add_epi32( eleven, three ) );
  casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eleven, four ) );
+                                    _mm512_add_epi32( eleven, four  ) );
  casti_m512i( d,16 ) = _mm512_permutexvar_epi32(
                          _mm512_castsi128_si512( s1 ), zero );
  casti_m512i( d,17 ) = _mm512_permutexvar_epi32(
@@ -717,7 +717,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
  casti_m512i( d,18 ) = _mm512_permutexvar_epi32(
                          _mm512_castsi128_si512( s1 ), two  );
  casti_m512i( d,19 ) = _mm512_permutexvar_epi32(
-                          _mm512_castsi128_si512( s1 ), tre  );
+                          _mm512_castsi128_si512( s1 ), three  );
 }

 #endif    // AVX512
@@ -1006,20 +1006,20 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src )
   __m512i *d = (__m512i*)dst;
   __m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) );
   __m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) );
-  const __m512i zero = m512_zero;
-  const __m512i one  = m512_one_64;
-  const __m512i two  = _mm512_add_epi64( one, one );
-  const __m512i tre  = _mm512_add_epi64( two, one );
-  const __m512i four = _mm512_add_epi64( two, two );
+  const __m512i zero   = m512_zero;
+  const __m512i one    = m512_one_64;
+  const __m512i two    = _mm512_add_epi64( one, one );
+  const __m512i three  = _mm512_add_epi64( two, one );
+  const __m512i four   = _mm512_add_epi64( two, two );

  d[0] = _mm512_permutexvar_epi64( s0, zero );
  d[1] = _mm512_permutexvar_epi64( s0, one  );
  d[2] = _mm512_permutexvar_epi64( s0, two  );
-  d[3] = _mm512_permutexvar_epi64( s0, tre  );
+  d[3] = _mm512_permutexvar_epi64( s0, three  );
  d[4] = _mm512_permutexvar_epi64( s0, four );
-  d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one ) );
-  d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) );
-  d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, tre ) );
+  d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one   ) );
+  d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two   ) );
+  d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) );
  d[8] = _mm512_permutexvar_epi64(
           _mm512_castsi128_si512( s1 ), zero );
  d[9] = _mm512_permutexvar_epi64(
@@ -1296,25 +1296,18 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 #if defined(__SSE4_1__)
 // No SSE2 implementation.

-#define mm128_intrlv_blend_64( hi, lo ) \
-                _mm_blend_epi16( hi, lo, 0x0f )
-#define mm128_intrlv_blend_32( hi, lo ) \
-                _mm_blend_epi16( hi, lo, 0x33 )
+#define mm128_intrlv_blend_64( hi, lo )   _mm_blend_epi16( hi, lo, 0x0f )
+#define mm128_intrlv_blend_32( hi, lo )   _mm_blend_epi16( hi, lo, 0x33 )

 #endif   // SSE4_1

 #if defined(__AVX2__)

-#define mm256_intrlv_blend_128( hi, lo ) \
-                _mm256_blend_epi32( hi, lo, 0x0f )
+#define mm256_intrlv_blend_128( hi, lo )  _mm256_blend_epi32( hi, lo, 0x0f )
+#define mm256_intrlv_blend_64( hi, lo )   _mm256_blend_epi32( hi, lo, 0x33 )
+#define mm256_intrlv_blend_32( hi, lo )   _mm256_blend_epi32( hi, lo, 0x55 )

-#define mm256_intrlv_blend_64( hi, lo ) \
-                _mm256_blend_epi32( hi, lo, 0x33 )
-
-#define mm256_intrlv_blend_32( hi, lo ) \
-           _mm256_blend_epi32( hi, lo, 0x55 )
-
-// Blend 32 byte lanes of hash from 2 sources according to control mask.
+// Select lanes of 32 byte hash from 2 sources according to control mask.
 // macro due to 256 bit value arg.
 #define mm256_blend_hash_4x64( dst, a, b, mask ) \
 do { \
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -358,17 +358,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 // no SSE2 implementation, no current users

 #define mm128_ror_1x16( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,15,14,13,12,11,10 \
-                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
+   _mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
+                                       0x0908070605040302 ) )
 #define mm128_rol_1x16( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 13,12,11,10, 9, 8, 7, 6, \
-                                       5, 4, 3, 2, 1, 0,15,14 ) )
+   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
+                                       0x0504030201000f0e ) )
 #define mm128_ror_1x8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8(  0,15,14,13,12,11,10, 9, \
-                                       8, 7, 6, 5, 4, 3, 2, 1 ) )
+   _mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
+                                       0x0807060504030201 ) )
 #define mm128_rol_1x8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
-                                       6, 5, 4, 3, 2, 1, 0,15 ) )
+   _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
+                                       0x060504030201000f ) )
 #endif  // SSE3

 // Rotate 16 byte (128 bit) vector by c bytes.
@@ -386,12 +386,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 #define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )

 #define mm128_ror16_64( v )   _mm_shuffle_epi8( v, \
-         _mm_set_epi8(  9, 8,15,14,13,12,11,10,  1, 0, 7, 6, 5, 4, 3, 2 )
+                   m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
 #define mm128_rol16_64( v )   _mm_shuffle_epi8( v, \
-              _mm_set_epi8( 13,12,11,10, 9, 8,15,14,  5, 4, 3, 2, 1, 0, 7, 6 )
+                   m128_const_64( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) )

 #define mm128_swap16_32( v )  _mm_shuffle_epi8( v, \
-                      _mm_set_epi8( 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2 )
+                   m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) )

 //
 // Endian byte swap.
@@ -399,16 +399,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 #if defined(__SSSE3__)

 #define mm128_bswap_64( v ) \
-   _mm_shuffle_epi8( v, m128_const64(  0x08090a0b0c0d0e0f, \
+   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )

 #define mm128_bswap_32( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
                                       0x0405060700010203 ) )

-#define mm128_bswap_16( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 14,15,  12,13,  10,11,   8, 9, \
-                                       6, 7,   4, 5,   2, 3,   0, 1 ) )
+#define mm128_bswap_16( v ) _mm_shuffle_epi8( v, \
+                   m128_const_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )

 // 8 byte qword * 8 qwords * 2 lanes = 128 bytes
 #define mm128_block_bswap_64( d, s ) do \
@@ -462,14 +461,14 @@ static inline __m128i mm128_bswap_16( __m128i v )

 static inline void mm128_block_bswap_64( __m128i *d, __m128i *s )
 {
-   d[0] = mm128_bswap_32( s[0] );
-   d[1] = mm128_bswap_32( s[1] );
-   d[2] = mm128_bswap_32( s[2] );
-   d[3] = mm128_bswap_32( s[3] );
-   d[4] = mm128_bswap_32( s[4] );
-   d[5] = mm128_bswap_32( s[5] );
-   d[6] = mm128_bswap_32( s[6] );
-   d[7] = mm128_bswap_32( s[7] );
+   d[0] = mm128_bswap_64( s[0] );
+   d[1] = mm128_bswap_64( s[1] );
+   d[2] = mm128_bswap_64( s[2] );
+   d[3] = mm128_bswap_64( s[3] );
+   d[4] = mm128_bswap_64( s[4] );
+   d[5] = mm128_bswap_64( s[5] );
+   d[6] = mm128_bswap_64( s[6] );
+   d[7] = mm128_bswap_64( s[7] );
 }

 static inline void mm128_block_bswap_32( __m128i *d, __m128i *s )
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -32,6 +32,7 @@

 // set instructions load memory resident constants, this avoids mem.
 // cost 4 pinsert + 1 vinsert, estimate 7 clocks.
+// Avoid using; calling m128_const_64 twice is still faster.
 #define m256_const_64( i3, i2, i1, i0 ) \
   _mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \
                            m128_const_64( i3, i2 ), 1 )
@@ -50,7 +51,7 @@ static inline __m256i m256_one_64_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubq %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -63,7 +64,7 @@ static inline __m256i m256_one_32_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubd %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -76,7 +77,7 @@ static inline __m256i m256_one_16_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubw %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -89,7 +90,7 @@ static inline __m256i m256_one_8_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubb %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -100,7 +101,7 @@ static inline __m256i m256_neg1_fn()
 {
   __m256i a;
   asm( "vpcmpeqq %0, %0, %0\n\t"
-        :"=x"(a) );
+        : "=x"(a) );
   return a;
 }
 #define m256_neg1    m256_neg1_fn()
@@ -423,23 +424,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )

 // Rotate 256 bit vector by one 16 bit element.     
 #define mm256_ror_1x16( v ) \
-   _mm256_permutexvar_epi16( _mm256_set_epi16( \
-         0,15,14,13,12,11,10, 9,   8, 7, 6, 5, 4, 3, 2, 1 ), v )
+   _mm256_permutexvar_epi16( m256_const_64( \
+                                 0x0000000f000e000d, 0x000c000b000a0009, \
+                                 0x0008000700060005, 0x0004000300020001 ), v )

 #define mm256_rol_1x16( v ) \
-   _mm256_permutexvar_epi16( _mm256_set_epi16( \
-        14,13,12,11,10, 9, 8, 7,   6, 5, 4, 3, 2, 1, 0,15 ), v )
+   _mm256_permutexvar_epi16( m256_const_64( \
+                                 0x000e000d000c000b, 0x000a000900080007, \
+                                 0x0006000500040003, 0x000200010000000f ), v )

 // Rotate 256 bit vector by one byte.
-#define mm256_ror_1x8( v ) \
-   _mm256_permutexvar_epi8( _mm256_set_epi8( \
-         0,31,30,29,28,27,26,25,  24,23,22,21,20,19,18,17, \
-        16,15,14,13,12,11,10, 9,   8, 7, 6, 5, 4, 3, 2, 1 ), v )
+#define mm256_ror_1x8( v ) _mm256_permutexvar_epi8( m256_const_64( \
+                                 0x001f1e1d1c1b1a19, 0x1817161514131211, \
+                                 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )

-#define mm256_rol_1x8( v ) \
-   _mm256_permutexvar_epi8( _mm256_set_epi8( \
-        30,29,28,27,26,25,24,23,  22,21,20,19,18,17,16,15, \
-        14,13,12,11,10, 9, 8, 7,   6, 5, 4, 3, 2, 1, 0,31 ), v )
+#define mm256_rol_1x8( v ) _mm256_permutexvar_epi8( m256_const_64( \
+                                 0x1e1d1c1b1a191817, 0x161514131211100f, \
+                                 0x0e0d0c0b0a090807, 0x060504030201001f ), v )

 #endif  // AVX512

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -503,7 +503,7 @@ static inline __m512i m512_neg1_fn()
                       0x08090A0B, 0x0C0D0E0F,   0x00010203, 0x04050607 ) )

 #define mm512_bswap_32( v ) \
-   _mm512_permutexvar_epi8( v, _mm512_set_epi832( \
+   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \