This commit is contained in:
Jay D Dee
2023-11-30 14:36:47 -05:00
parent 4e3f1b926f
commit 9d3a46c355
29 changed files with 3081 additions and 2234 deletions

View File

@@ -207,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
// broadcast lane l to all lanes
// broadcast (replicate) lane l to all lanes
#define v128_replane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
: _mm_shuffle_epi32( v, 0xee )
@@ -319,29 +319,27 @@ static inline __m128i v128_neg1_fn()
// c[7:6] source element selector
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v0, c ) \
#define v128_xim32( v1, v0, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v0 ), c ) )
#define v128_xim32 mm128_xim_32
// Examples of simple operations using xim:
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim32( v, mm128_mov32_128( i32 ), (c)<<4 )
*/
#define mm128_mask_32( v, m ) mm128_xim_32( v, v, m )
#define v128_mask32( v, m ) v128_xim32( v, v, m & 0xf )
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
//static inline __m128i mm128_mask_32( const __m128i v, const int m )
//{ return mm128_xim_32( v, v, m ); }
#define v128_mask32 mm128_mask_32
//static inline __m128i v128_mask32( const __m128i v, const int m )
//{ return v128_xim32( v, v, m ); }
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
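// Illustrative sketch, not part of the original commit; the demo name is
// made up. Copies lane 2 of b into lane 0 of a, then zeroes lanes 1 and 3.
static inline __m128i v128_xim_demo( __m128i a, __m128i b )
{
   __m128i t = v128_movlane32( a, 0, b, 2 );   // t = { b[2], a[1], a[2], a[3] }
   return v128_mask32( t, 0x0a );              // zero lanes 1 and 3
}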
#endif // SSE4_1
@@ -452,7 +450,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif
@@ -483,7 +481,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
// These should never be called from application code, use rol/ror.
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -498,14 +496,14 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#if defined(__AVX512VL__)
// AVX512 fastest all rotations.
// AVX512 fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32
// ror/rol will always find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
// application code performing byte operations rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
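// Illustrative sketch, not part of the original commit; the function name is
// made up. The same data movement as v128_shuflr64_8 written as an explicit
// byte shuffle, showing why the shuffle style names can read better in byte
// oriented code: byte i of each 64 bit lane moves to position i-1, byte 0
// wraps to byte 7.
static inline __m128i v128_shuflr64_8_bytes( __m128i v )
{
   return _mm_shuffle_epi8( v, _mm_set_epi64x( 0x080f0e0d0c0b0a09,
                                               0x0007060504030201 ) );
}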
@@ -577,7 +575,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
: v128_rol32_sse2( v, c )
#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
// SSE2: fastest 32 bit, very fast 16, all else slow
#define v128_ror64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
@@ -608,9 +606,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
//#define v128_ror64 mm128_ror_64
//#define v128_rol64 mm128_rol_64
//#define v128_ror32 mm128_ror_32
// deprecated
#define mm128_rol_32 v128_rol32
/* not used
@@ -633,7 +629,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define mm128_2rol32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
@@ -684,11 +680,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// Cross lane shuffles
// No NEON version
#define v128_shuffle32 _mm_shuffle_epi32
// shuffle using vector mask, for compatibility with NEON
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -697,12 +695,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) )
#define mm128_shuffle2_64 v128_shuffle2_64
#define v128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )
#define mm128_shuffle2_32 v128_shuffle2_32
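// Illustrative sketch, not part of the original commit; the demo name is
// made up. Bit 0 of the control selects the lane taken from v1, bit 1 the
// lane taken from v2, so control 0 pairs the two low 64 bit lanes.
static inline __m128i v128_shuffle2_64_demo( __m128i a, __m128i b )
{
   return v128_shuffle2_64( a, b, 0 );   // { a[0], b[0] }
}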
// Rotate vector elements across all lanes
@@ -734,6 +730,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
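// Illustrative sketch, not part of the original commit; the demo name is
// made up. Byte order is reversed independently within each 32 bit element.
static inline __m128i v128_bswap32_demo( void )
{
   // lanes { 0x00112233, 0x44556677, .. } become { 0x33221100, 0x77665544, .. }
   return v128_bswap32( _mm_set_epi32( 0x0c0d0e0f, 0x08090a0b,
                                       0x44556677, 0x00112233 ) );
}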
#define v128_bswap16( v ) \

View File

@@ -68,7 +68,7 @@
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// Widening, shuffle high element to align with Intel
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -97,7 +97,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
// bit shift
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
@@ -108,7 +108,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
// Unit tested, working.
// Arithmetic shift.
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )
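// Illustrative sketch, not part of the original commit; the demo name is
// made up. Arithmetic shift replicates the sign bit, logical shift zero fills.
static inline uint32x4_t v128_sra32_demo( void )
{
   uint32x4_t v = vdupq_n_u32( 0x80000000 );
   // v128_sr32( v, 4 )  -> 0x08000000 in every lane (zero fill)
   // v128_sra32( v, 4 ) -> 0xf8000000 in every lane (sign fill)
   return (uint32x4_t)v128_sra32( v, 4 );
}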
@@ -255,24 +255,24 @@ typedef union
#define v128_8 vmovq_n_u8
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
| (uint32_t)(u16_0) ) ) )
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16) \
| (uint32_t)(u16_0) ) ) )
#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
vcreate_u8( \
( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
| (uint16_t)(u8_4) ) )) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
| (uint16_t)(u8_0) ) )) ))
vcreate_u8( \
( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
| (uint16_t)(u8_4) ) ) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
| (uint16_t)(u8_0) ) ) ) ) )
#define v128_set64( u64_1, u64_0 ) \
vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )
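// Illustrative sketch, not part of the original commit; the demo name is
// made up. Builds the 128 bit constant used by the x86_64 v128_bswap32
// byte shuffle: lane 0 = 0x0405060700010203, lane 1 = 0x0c0d0e0f08090a0b.
static inline uint64x2_t v128_set64_demo( void )
{
   return v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
}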
@@ -406,15 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
/* not used anywhere and hopefully never will be
// vector mask, use as a last resort; prefer tbl, rev, alignr, etc.
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
*/
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask );
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
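// Illustrative sketch, not part of the original commit; the demo name is
// made up. A single table lookup expressing a 32 bit byte swap, using the
// same control bytes as the x86_64 v128_bswap32.
static inline uint8x16_t v128_shuffle8_demo( uint8x16_t v )
{
   const uint8x16_t bswap32_mask = {  3,  2,  1,  0,  7,  6,  5,  4,
                                     11, 10,  9,  8, 15, 14, 13, 12 };
   return v128_shuffle8( v, bswap32_mask );
}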
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -532,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Programmable shuffles
// no compatible shuffles with x86_64, will require targeted user code.
#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
d7, d6, d5, d4, d3, d2, d1, d0, vmask ) \
d0 = ((uint8_t*)(&vmask))[0];  d1 = ((uint8_t*)(&vmask))[1]; \
d2 = ((uint8_t*)(&vmask))[2];  d3 = ((uint8_t*)(&vmask))[3]; \
d4 = ((uint8_t*)(&vmask))[4];  d5 = ((uint8_t*)(&vmask))[5]; \
d6 = ((uint8_t*)(&vmask))[6];  d7 = ((uint8_t*)(&vmask))[7]; \
d8 = ((uint8_t*)(&vmask))[8];  d9 = ((uint8_t*)(&vmask))[9]; \
da = ((uint8_t*)(&vmask))[10]; db = ((uint8_t*)(&vmask))[11]; \
dc = ((uint8_t*)(&vmask))[12]; dd = ((uint8_t*)(&vmask))[13]; \
de = ((uint8_t*)(&vmask))[14]; df = ((uint8_t*)(&vmask))[15];
// Blendv
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
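// Illustrative sketch, not part of the original commit; the demo name is
// made up. Bitwise select: the result takes v0 where a mask bit is set and
// v1 where it is clear. Here lanes 0 and 2 come from v0, lanes 1 and 3 from v1.
static inline uint32x4_t v128_blendv_demo( uint32x4_t v1, uint32x4_t v0 )
{
   const uint32x4_t even_lanes = { 0xffffffff, 0, 0xffffffff, 0 };
   return v128_blendv( v1, v0, even_lanes );
}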