This commit is contained in:
Jay D Dee
2024-07-01 00:33:19 -04:00
parent c47c4a8885
commit 47e24b50e8
23 changed files with 2097 additions and 2855 deletions

View File

@@ -2436,7 +2436,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
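For context, the bswap_shuf constant above is a pshufb control that byte-swaps each 32-bit lane. A minimal standalone sketch, assuming SSSE3 and a hypothetical test harness (not part of this commit):

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   // one 32-bit value per lane
   __m128i v = _mm_set_epi32( 0x44332211, 0x88776655, 0xccbbaa99, 0x00ffeedd );
   v = _mm_shuffle_epi8( v, bswap_shuf );           // byte swap each 32-bit lane
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, v );
   printf( "%08x %08x %08x %08x\n", out[3], out[2], out[1], out[0] );
   // prints: 11223344 55667788 99aabbcc ddeeff00
   return 0;
}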

View File

@@ -165,6 +165,11 @@ typedef union
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
#define v128_mov64 _mm_cvtsi64_si128
#define v128_mov32 _mm_cvtsi32_si128
/*
static inline __m128i v128_mov64( const uint64_t n )
{
__m128i a;
@@ -186,11 +191,14 @@ static inline __m128i v128_mov32( const uint32_t n )
#endif
return a;
}
*/
// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )
// Not used, test first
/*
#if defined(__AVX2__)
#define v128_bcast16(v) _mm_broadcastw_epi16(v)
@@ -198,9 +206,10 @@ static inline __m128i v128_mov32( const uint32_t n )
#else
#define v128_bcast16(v) \
v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
_mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )
#endif
*/
// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
@@ -216,28 +225,15 @@ static inline __m128i v128_mov32( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)
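A brief usage sketch of the tests above, assuming SSE4.1 is enabled; the helper names are hypothetical and the xor intrinsic is used directly to stay self-contained:

// Returns 1 when the two vectors are bit-for-bit identical: the xor of
// equal vectors is all zero, which v128_cmpeq0 / _mm_testz_si128 detects.
static inline int v128_equal_example( __m128i a, __m128i b )
{ return v128_cmpeq0( _mm_xor_si128( a, b ) ); }

// Returns 1 when every bit of v is set, e.g. a fully matching compare mask.
static inline int v128_allset_example( __m128i v )
{ return v128_cmpeq1( v ); }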
#define v128_one v128_mov64(1)
//#define v128_one v128_mov64(1)
#define v128_one _mm_cvtsi64_si128( 1 )
// Inline asm avoids the need to initialize the return variable, which would
// otherwise trigger a compiler warning.
// The macro hides the function call parentheses so it reads like an identifier.
static inline __m128i v128_neg1_fn()
{
__m128i a;
#if defined(__AVX__)
#if defined(__AVX__)
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
@@ -268,7 +264,6 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -279,13 +274,14 @@ static inline __m128i v128_neg1_fn()
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
#define v128_get8( v, l ) _mm_extract_epi8( v, l )
#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l ) _mm_insert_epi64( v, u32, l )
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
@@ -396,7 +392,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
// Boolean operations
#if defined(VL256)
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
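To make the expression-safety caveat concrete: the ternary-logic form above expands v0 twice, so an argument with side effects is evaluated twice. A hedged sketch of the inline-function alternative mentioned in the comment (function name hypothetical):

// v128_ornot( a, *p++ ) would advance p twice through the macro above.
// An inline function evaluates each argument exactly once; a minimal
// sketch of ~v1 | v0 without VL256:
static inline __m128i v128_ornot_fn( __m128i v1, __m128i v0 )
{ return _mm_or_si128( _mm_xor_si128( v1, _mm_set1_epi64x( -1 ) ), v0 ); }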
@@ -430,7 +429,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#else
#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
@@ -464,9 +463,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_movmask32( v ) \
_mm_movemask_ps( (__m128)(v) )
//
// Bit rotations
// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
_mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
@@ -476,6 +473,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
//
// Bit rotations
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
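For reference, the fallback above is the standard shift-and-OR rotate applied per 64-bit lane; the scalar identity it implements (sketch, helper name hypothetical, valid for 0 < c < 64):

static inline uint64_t ror64_scalar_ref( uint64_t x, unsigned c )
{ return ( x >> c ) | ( x << ( 64 - c ) ); }    // same identity, one lane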
@@ -601,7 +601,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
// ror( v1 ^ v0, n )
// (v1 ^ v0) >>> n, ARM NEON has an optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
@@ -700,15 +700,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_shuffle16( v, 0x1b )
// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Endian byte swap.
#if defined(__SSSE3__)
@@ -911,25 +907,27 @@ static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_alignr8( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
_mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )
// The c arg is trivial, the only valid value is 1.
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )
#endif
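As a cross-check of the corrected emulations above: v128_alignr64( hi, lo, 1 ) selects the middle 128 bits of the concatenation hi:lo, which on SSSE3 is a single instruction. A reference sketch, assuming SSSE3 (helper name hypothetical):

// Bytes 8..23 of hi:lo, i.e. the high qword of lo in the low half of the
// result and the low qword of hi in the high half.
static inline __m128i alignr64_ref( __m128i hi, __m128i lo )
{ return _mm_alignr_epi8( hi, lo, 8 ); }        // equivalent to c == 1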
// blend using vector mask
#if defined(__SSE4_1__)
// Bytewise using sign bit of each byte element of mask
// Bytewise using sign bit of each byte element of mask. Use full bitmask
// for compatibility with SSE2 & NEON.
#define v128_blendv _mm_blendv_epi8
#else
// Bitwise
// Bitwise, use only bytewise masks for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
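Because the SSE4.1 path tests only the sign bit of each mask byte while the fallback above uses every mask bit, portable callers should supply masks whose selected elements are fully set, as the compare intrinsics already produce. A usage sketch (helper name hypothetical):

// Per-lane maximum: the compare yields 0xffffffff or 0 per 32-bit lane,
// which satisfies both the bytewise (sign bit) and bitwise blend forms.
// Where mask bits are set the fallback above selects v0, otherwise v1.
static inline __m128i max32_example( __m128i a, __m128i b )
{
   __m128i mask = _mm_cmpgt_epi32( a, b );      // a > b per lane
   return v128_blendv( b, a, mask );            // take a where a > b
}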

View File

@@ -172,7 +172,7 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
@@ -217,12 +217,11 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_movemask_ps( _mm256_castsi256_ps( v ) )
//
// Bit rotations.
// shuffle 16 bit elements within 64 bit lanes.
#define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
// reverse elements within lanes.
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
@@ -242,6 +241,9 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \

View File

@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses an 8 bit truth table to define any 3 input logical
// expression using any combination of AND, OR, XOR, NOT.
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
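As a worked example of the truth table encoding: the immediate lists the desired output for each combination of the three inputs a, b, c (here a = v1 and b = c = v0), indexed by a<<2 | b<<1 | c. For ~v1 | v0 the output is 0 only when a = 1 and b = 0:

   index (a b c) :   7   6   5   4   3   2   1   0
   ~a | b        :   1   1   0   0   1   1   1   1    ->  0b11001111 = 0xcf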

View File

@@ -456,7 +456,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -465,12 +464,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }
static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }
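For reference, vextq returns consecutive lanes of the self-concatenation v:v starting at the given lane, which is why it implements a lane rotation; a worked illustration with hypothetical lane values:

// With 32-bit lanes {a, b, c, d} (a in lane 0):
//   vextq_u32( v, v, 1 ) -> { b, c, d, a }     // v128_shuflr32
//   vextq_u32( v, v, 3 ) -> { d, a, b, c }     // v128_shufll32
// vextq_u16( v, v, 1 ) and vextq_u16( v, v, 7 ) rotate the eight 16-bit
// lanes the same way for v128_shuflr16 / v128_shufll16.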
// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
@@ -547,7 +540,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
#endif // __ARM_NEON