v3.21.3

2026-02-23 00:43:08 +00:00 · 2023-03-11 14:54:49 -05:00
parent fb93160641
commit b339450898
49 changed files with 1120 additions and 1119 deletions
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -409,19 +409,20 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }

+/* Not used
 #define mm512_shuflr_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x0000001F001E001D, 0x001C001B001A0019, \
-                       0X0018001700160015, 0X0014001300120011, \
-                       0X0010000F000E000D, 0X000C000B000A0009, \
-                       0X0008000700060005, 0X0004000300020001 ), v )
+                       0x0018001700160015, 0x0014001300120011, \
+                       0x0010000F000E000D, 0x000C000B000A0009, \
+                       0x0008000700060005, 0x0004000300020001 ), v )

 #define mm512_shufll_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001E001D001C001B, 0x001A001900180017, \
-                       0X0016001500140013, 0X001200110010000F, \
-                       0X000E000D000C000B, 0X000A000900080007, \
-                       0X0006000500040003, 0X000200010000001F ), v )
+                       0x0016001500140013, 0x001200110010000F, \
+                       0x000E000D000C000B, 0x000A000900080007, \
+                       0x0006000500040003, 0x000200010000001F ), v )

 #define mm512_shuflr_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
@@ -436,6 +437,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
                       0x2E2D2C2B2A292827, 0x262524232221201F, \
                       0x1E1D1C1B1A191817, 0x161514131211100F, \
                       0x0E0D0C0B0A090807, 0x060504030201003F ) )
+*/

 // 256 bit lanes used only by lyra2, move these there
 // Rotate elements within 256 bit lanes of 512 bit vector.
@@ -449,7 +451,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

-/*
+/* Not used
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_shuflr256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +498,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
+/* Not used
 // Limited 2 input, 1 output shuffle, combines shuffle with blend.
 // Like most shuffles it's limited to 128 bit lanes and like some shuffles
 // destination elements must come from a specific source arg. 
@@ -506,7 +509,10 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuffle2_32( v1, v2, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                           _mm512_castsi512_ps( v2 ), c ) ); 
+*/

+// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
+// efficient generic versions don't exist.
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
 #define mm512_shuflr128_64  mm512_swap128_64
@@ -516,9 +522,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )

-// Rotate right 128 bit lanes by c bytes, versatile and just as fast
+/* Not used
+// Rotate each 128 bit lane right by c bytes, efficient generic form of above.
 static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }
+*/

 // Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
 // can be done with ror & rol. Defined only for convenience and consistency