v25.4

2026-02-22 16:33:08 +00:00 · 2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -108,11 +108,13 @@ typedef union

 // A simple 128 bit permute, using function instead of macro avoids
 // problems if the v arg passed as an expression.
-static inline __m512i mm512_perm_128( const __m512i v, const int c )
+static inline __m512i mm512_perm128( const __m512i v, const int c )
 {  return _mm512_shuffle_i64x2( v, v, c ); }

 // Broadcast 128 bit vector to all lanes of 512 bit vector.
-#define mm512_bcast_m128( v )  mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
+#define mm512_bcast128( v )    mm512_perm128( _mm512_castsi128_si512( v ), 0 )
+// Deprecated alias, use mm512_bcast128 instead.
+#define mm512_bcast_m128  mm512_bcast128 

 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
@@ -120,7 +122,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 #define mm512_bcast128hi_64( i64 )     _mm512_maskz_set1_epi64( 0xaa, i64 )

 #define mm512_set2_64( i1, i0 ) \
-   mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
+   mm512_bcast128( _mm_set_epi64x( i1, i0 ) )

 // Pseudo constants.
 #define m512_zero       _mm512_setzero_si512()
@@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Reverse byte order of packed elements, vectorized endian conversion.

-#define mm512_bswap_64( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                              0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
+#define mm512_bswap_64( v )  _mm512_shuffle_epi8( v, V512_BSWAP64 )

-#define mm512_bswap_32( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                              0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
+#define mm512_bswap_32( v )  _mm512_shuffle_epi8( v, V512_BSWAP32 )
+
+/* not used
+#define mm512_bswap_16( v ) \
+   _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
+                              0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
+*/

 #define mm512_bswap_16( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                              0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )

 // Source and destination are pointers, may point to same memory.
 // 8 lanes of 64 bytes each
 #define mm512_block_bswap_64( d, s ) \
 { \
-  const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-  casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-  casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-  casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-  casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-}
-#define mm512_block_bswap64_512    mm512_block_bswap_64
-
-#define mm512_block_bswap64_1024( d, s ) \
-{ \
-  const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-  casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-  casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-  casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-  casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-  casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
-  casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
-  casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
-  casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
-  casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
-  casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
-  casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
-  casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
+  casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \
+  casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \
+  casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \
+  casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \
+  casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \
+  casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \
+  casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \
+  casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \
 }

 // 16 lanes of 32 bytes each
 #define mm512_block_bswap_32( d, s ) \
 { \
-  const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
-  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-  casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-  casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-  casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-  casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
+  casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \
+  casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \
+  casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \
+  casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \
+  casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \
+  casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \
+  casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \
+  casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \
 }
 #define mm512_block_bswap32_256   mm512_block_bswap_32

-#define mm512_block_bswap32_512( d, s ) \
-{ \
-  const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
-  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-  casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-  casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-  casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-  casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-  casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
-  casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
-  casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
-  casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
-  casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
-  casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
-  casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
-  casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
-}
-
-
-
 // Cross-lane shuffles implementing rotation of packed elements.
 // 

+// Shuffle 16 bit elements within 64 bit lanes, same control c for every lane.
+#define mm512_shuffle16( v, c ) \
+   _mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c )
+
 // Rotate elements across entire vector.
-static inline __m512i mm512_swap_256( const __m512i v )
+static inline __m512i mm512_rev_256( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 4 ); }
-#define mm512_shuflr_256   mm512_swap_256
-#define mm512_shufll_256   mm512_swap_256
+#define mm512_swap_256      mm512_rev_256     // grandfathered

 static inline __m512i mm512_shuflr_128( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 2 ); }
@@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 // Rotate elements within 256 bit lanes of 512 bit vector.

 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap256_128( v )      _mm512_permutex_epi64( v, 0x4e )
-#define mm512_shuflr256_128 mm512_swap256_128
-#define mm512_shufll256_128 mm512_swap256_128
+#define mm512_rev256_128( v )       _mm512_permutex_epi64( v, 0x4e )
+#define mm512_swap256_128           mm512_rev256_128  // grandfathered

 // Rotate 256 bit lanes by one 64 bit element
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
@@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
-#define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
-#define mm512_shuflr128_64      mm512_swap128_64
-#define mm512_shufll128_64      mm512_swap128_64
+#define mm512_rev128_64( v )      _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_swap128_64          mm512_rev128_64   // grandfathered
+
+/*not used
+#define mm512_rev128_32(v)        _mm512_shuffle_epi32( v, 0x1b )
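+#define mm512_rev128_16(v)         mm512_shuffle16( v, 0x1b )  // NOTE(review): reverses within each 64 bit half only, 128 bit reverse also needs a 64 bit swap - confirm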
+*/

 // Rotate 128 bit lanes by one 32 bit element
 #define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )

 /* Not used
+
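+#define mm512_shuflr128_16(v)   mm512_shuffle16( v, 0x39 )  // NOTE(review): rotates within each 64 bit half, not across the 128 bit lane - confirm
+#define mm512_shufll128_16(v)   mm512_shuffle16( v, 0x93 )  // NOTE(review): rotates within each 64 bit half, not across the 128 bit lane - confirm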
+   
 // Rotate 128 bit lanes right by c bytes, versatile and just as fast
 static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }
@@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
                                           _mm512_castsi512_ps( v2 ), c ) ); 

 // 64 bit lanes
-// Not really necessary with AVX512, included for consistency with AVX2/SSE.
+// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.

-#define mm512_swap64_32( v )    _mm512_shuffle_epi32( v, 0xb1 )
-#define mm512_shuflr64_32       mm512_swap64_32
-#define mm512_shufll64_32       mm512_swap64_32
+#define mm512_qrev32( v )       _mm512_shuffle_epi32( v, 0xb1 )
+#define mm512_swap64_32         mm512_qrev32        // grandfathered

 #define mm512_shuflr64_24( v )  _mm512_ror_epi64( v, 24 )
 #define mm512_shufll64_24( v )  _mm512_rol_epi64( v, 24 )
@@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
 /* Not used
 // 32 bit lanes

-#define mm512_swap32_16( v )    _mm512_ror_epi32( v, 16 )
-#define mm512_shuflr32_16       mm512_swap32_16
-#define mm512_shufll32_16       mm512_swap32_16
+#define mm512_lrev16( v )       _mm512_ror_epi32( v, 16 )

 #define mm512_shuflr32_8( v )   _mm512_ror_epi32( v,  8 )
 #define mm512_shufll32_8( v )   _mm512_rol_epi32( v,  8 )