v23.5

2026-02-22 16:33:08 +00:00 · 2023-10-25 20:36:20 -04:00
parent 31c4dedf59
commit 160608cce5
180 changed files with 10318 additions and 13097 deletions
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -217,6 +217,69 @@ static inline __m256i mm256_not( const __m256i v )

 //
 //           Bit rotations.
+
+// Generic shift-and-OR rotates: slower, used only as a last resort.
+#define mm256_ror_64_avx2( v, c )   /* rotate 64-bit elements right */ \
+   _mm256_or_si256( _mm256_slli_epi64( v, 64 - (c) ), \
+                    _mm256_srli_epi64( v, c ) )
+
+#define mm256_rol_64_avx2( v, c )   /* rotate 64-bit elements left */ \
+   _mm256_or_si256( _mm256_srli_epi64( v, 64 - (c) ), \
+                    _mm256_slli_epi64( v, c ) )
+
+#define mm256_ror_32_avx2( v, c )   /* rotate 32-bit elements right */ \
+   _mm256_or_si256( _mm256_slli_epi32( v, 32 - (c) ), \
+                    _mm256_srli_epi32( v, c ) )
+
+#define mm256_rol_32_avx2( v, c )   /* rotate 32-bit elements left */ \
+   _mm256_or_si256( _mm256_srli_epi32( v, 32 - (c) ), \
+                    _mm256_slli_epi32( v, c ) )
+
+#if defined(__AVX512VL__)
+// Under AVX512VL the rotate names alias the rotate intrinsics directly.
+#define mm256_ror_64    _mm256_ror_epi64
+#define mm256_rol_64    _mm256_rol_epi64
+#define mm256_ror_32    _mm256_ror_epi32
+#define mm256_rol_32    _mm256_rol_epi32
+
+#else
+// Rotate 64-bit elements right by c bits; c = 8, 16, 24, 32 use one shuffle.
+#define mm256_ror_64( v, c ) \
+ ( ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
+ : ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+              _mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) \
+ : ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+              _mm_set_epi64x( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) ) \
+ : ( (c) ==  8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+              _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) \
+ : mm256_ror_64_avx2( v, c ) )
+// Rotate 64-bit elements left by c bits; c = 8, 16, 24, 32 use one shuffle.
+#define mm256_rol_64( v, c ) \
+ ( ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
+ : ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) \
+ : ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) ) \
+ : ( (c) ==  8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) \
+ : mm256_rol_64_avx2( v, c ) )
+// Rotate 32-bit elements right by c bits; c = 8 or 16 use a single shuffle.
+#define mm256_ror_32( v, c ) \
+ ( ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
+ : ( (c) ==  8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) \
+ : mm256_ror_32_avx2( v, c ) )
+// Rotate 32-bit elements left by c bits; c = 8 or 16 use a single shuffle.
+#define mm256_rol_32( v, c ) \
+ ( ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
+ : ( (c) ==  8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
+             _mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) \
+ : mm256_rol_32_avx2( v, c ) )
+
+#endif
+
 //
 // x2 rotates elements in 2 individual vectors in a double buffered
 // optimization for AVX2, does nothing for AVX512 but is here for
@@ -224,12 +287,12 @@ static inline __m256i mm256_not( const __m256i v )

 #if defined(__AVX512VL__)
 //TODO Enable for AVX10_256
-
+/*
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
 #define mm256_ror_32    _mm256_ror_epi32
 #define mm256_rol_32    _mm256_rol_epi32
-
+*/
 #define mm256_rorx2_64( v1, v0, c ) \
   _mm256_ror_epi64( v0, c ); \
   _mm256_ror_epi64( v1, c )
@@ -247,7 +310,7 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_rol_epi32( v1, c )

 #else   // AVX2
-
+/*
 // use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.

 #define mm256_ror_64( v, c ) \
@@ -265,7 +328,7 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_rol_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )
-
+*/
 #define mm256_rorx2_64( v1, v0, c ) \
 { \
 __m256i t0 = _mm256_srli_epi64( v0, c ); \
@@ -372,49 +435,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 { return _mm256_alignr_epi8( v, v, c ); }
 */

-// 64 bit lanes
-
-#define mm256_swap64_32( v )      _mm256_shuffle_epi32( v, 0xb1 )
-#define mm256_shuflr64_32         mm256_swap64_32
-#define mm256_shufll64_32         mm256_swap64_32
-
-//TODO Enable for AVX10_256
-#if defined(__AVX512VL__)
-  #define mm256_shuflr64_24( v )  _mm256_ror_epi64( v, 24 )
-#else
-  #define mm256_shuflr64_24( v ) \
-    _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
-                                 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
-#endif
-
-#if defined(__AVX512VL__)
-  #define mm256_shuflr64_16( v )  _mm256_ror_epi64( v, 16 )
-#else
-  #define mm256_shuflr64_16( v ) \
-    _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
-                                 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) )
-#endif
-
-// 32 bit lanes
-
-#if defined(__AVX512VL__)
-  #define mm256_swap32_16( v )  _mm256_ror_epi32( v, 16 )
-#else
-  #define mm256_swap32_16( v ) \
-    _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
-                                 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) )
-#endif
-#define mm256_shuflr32_16       mm256_swap32_16
-#define mm256_shufll32_16       mm256_swap32_16
-
-#if defined(__AVX512VL__)
-  #define mm256_shuflr32_8( v )  _mm256_ror_epi32( v, 8 )
-#else
-  #define mm256_shuflr32_8( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x0c0f0e0d080b0a09, 0x0407060500030201, \
-                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
-#endif
+// Swap the 32-bit halves of each 64-bit element, i.e. a 64-bit rotate by 32.
+#define mm256_swap64_32( v )     ( mm256_ror_64( v, 32 ) )

 // Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
@@ -428,10 +450,11 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 #define mm256_bswap_16( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
                                0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
+//

 // Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
-#define mm256_block_bswap_64( d, s ) do \
+#define mm256_block_bswap_64( d, s ) \
 { \
  __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
                                                  0x0001020304050607 ) ); \
@@ -443,10 +466,33 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
  casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
  casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
-} while(0)
+}
+#define mm256_block_bswap64_512   mm256_block_bswap_64 // 8 qwords per lane
+// 8 byte qword * 16 qwords * 4 lanes = 512 bytes
+#define mm256_block_bswap64_1024( d, s ) do \
+{ \
+  __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
+                                                  0x0001020304050607 ) ); \
+  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
+  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
+  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
+  casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
+  casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
+  casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
+  casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
+  casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
+  casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
+  casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
+  casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
+  casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
+  casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
+  casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
+  casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
+  casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
+} while(0)

 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
-#define mm256_block_bswap_32( d, s ) do \
+#define mm256_block_bswap_32( d, s ) \
 { \
  __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                                  0x0405060700010203 ) ); \
@@ -458,7 +504,31 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
  casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
  casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
-} while(0)
+}
+#define mm256_block_bswap32_256      mm256_block_bswap_32 // 8 dwords per lane
+// 4 byte dword * 16 dwords * 8 lanes = 512 bytes
+#define mm256_block_bswap32_512( d, s ) do \
+{ \
+  __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
+                                                  0x0405060700010203 ) ); \
+  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
+  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
+  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
+  casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
+  casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
+  casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
+  casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
+  casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
+  casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
+  casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
+  casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
+  casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
+  casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
+  casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
+  casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
+  casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
+} while(0)
+

 #endif // __AVX2__
 #endif // SIMD_256_H__