v3.21.3 Unreleased

Jay D Dee
2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions

View File

@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
#if defined(__SSSE3__)
static inline void mm128_bswap32_80( void *d, const void *s )
static inline void mm128_bswap32_80( void *d, void *s )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
@@ -482,7 +482,7 @@ static inline void mm128_bswap32_80( void *d, const void *s )
#else
static inline void mm128_bswap32_80( void *d, const void *s )
static inline void mm128_bswap32_80( void *d, void *s )
{
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
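(Illustration, not part of this commit.) The SSSE3 path's shuffle constant can be sanity-checked on its own: each 64-bit half of bswap_shuf lists byte indices 3,2,1,0 for every 32-bit lane, so _mm_shuffle_epi8 reverses the bytes of each word, matching the scalar bswap_32 fallback. A minimal standalone sketch, assuming a compiler with SSSE3 enabled and using the standard _mm_set_epi64x in place of the project's m128_const_64 helper:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

int main(void)
{
   // Same byte-index pattern as bswap_shuf above.
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   unsigned int in[4] = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
   unsigned int out[4];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, bswap_shuf ) );
   for ( int i = 0; i < 4; i++ )
      printf( "%08x\n", out[i] );   // 44332211 88776655 ccbbaa99 00ffeedd
   return 0;
}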

View File

@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
/* Not used
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,7 +395,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
*/
//
// Rotate vector elements across all lanes
@@ -407,7 +406,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
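(Illustration only, not from the source.) The immediates encode the element selection: 0x39 picks source elements 1,2,3,0 (lowest destination position first) and 0x93 picks 3,0,1,2, which is why these macros rotate the four 32-bit elements right and left by one position. A standalone SSE2 check:

#include <stdio.h>
#include <emmintrin.h>   // SSE2: _mm_shuffle_epi32

int main(void)
{
   unsigned int in[4] = { 0, 1, 2, 3 };
   unsigned int r[4], l[4];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   // 0x39: dst[0]=src[1], dst[1]=src[2], dst[2]=src[3], dst[3]=src[0]
   _mm_storeu_si128( (__m128i*)r, _mm_shuffle_epi32( v, 0x39 ) );
   // 0x93: dst[0]=src[3], dst[1]=src[0], dst[2]=src[1], dst[3]=src[2]
   _mm_storeu_si128( (__m128i*)l, _mm_shuffle_epi32( v, 0x93 ) );
   printf( "shuflr_32: %u %u %u %u\n", r[0], r[1], r[2], r[3] );   // 1 2 3 0
   printf( "shufll_32: %u %u %u %u\n", l[0], l[1], l[2], l[3] );   // 3 0 1 2
   return 0;
}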
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
@@ -415,7 +413,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
@@ -558,25 +555,68 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
// 256 bit vectors, though there is no need for this functionality.
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#else
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, 16-(c)*4 ), _mm_srli_si128( lo, (c)*4 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 12 ), \
_mm_srli_si128( v2, 4 ) )
#endif
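(Illustration, not part of this commit.) With the shift fixed at one element, mm128_alignr_64( hi, lo ) is expected to return { lo[1], hi[0] }: element 0 is the high qword of lo and element 1 is the low qword of hi. A standalone sketch of that expectation, assuming SSSE3:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8

int main(void)
{
   __m128i lo = _mm_set_epi64x( 11, 10 );   // lo = { 10, 11 }
   __m128i hi = _mm_set_epi64x( 21, 20 );   // hi = { 20, 21 }
   long long out[2];
   // Concatenate { hi, lo } and shift right one 64-bit element (8 bytes):
   // the surviving low 128 bits are { lo[1], hi[0] }.
   _mm_storeu_si128( (__m128i*)out, _mm_alignr_epi8( hi, lo, 8 ) );
   printf( "%lld %lld\n", out[0], out[1] );   // 11 20
   return 0;
}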
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only user is lyra2 blake2b when AVX2 is not available, and it is
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSSE3 else SSE2
#endif // __SSE2__
#endif // SIMD_128_H__
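(Illustration, not part of this commit.) Treating { v1, v2 } as one 256-bit value with v1 as the high 128 bits, mm128_vror256_64 rotates the whole 256 bits right by 64. A standalone sketch of the same data movement using the raw SSSE3 intrinsics:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8

int main(void)
{
   // Qwords of the 256-bit value { v1 : v2 }, least to most significant: 0,1,2,3
   __m128i v2 = _mm_set_epi64x( 1, 0 );   // low 128 bits
   __m128i v1 = _mm_set_epi64x( 3, 2 );   // high 128 bits
   long long q[4];

   // The same steps as mm128_vror256_64:
   __m128i t = _mm_alignr_epi8( v1, v2, 8 );   // new low  half: { 1, 2 }
   v1        = _mm_alignr_epi8( v2, v1, 8 );   // new high half: { 3, 0 }
   v2        = t;

   _mm_storeu_si128( (__m128i*)&q[0], v2 );
   _mm_storeu_si128( (__m128i*)&q[2], v1 );
   printf( "%lld %lld %lld %lld\n", q[0], q[1], q[2], q[3] );   // 1 2 3 0
   return 0;
}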

View File

@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test: if (mask[n]) then -1 else 0.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_movemask_pd( _mm256_castsi256_pd( v ) )
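(Illustration, not from the source.) The returned mask packs one bit per element, taken from the element's MSB, which is convenient for branching on whole-vector sign tests. A small sketch using the underlying intrinsic directly, assuming AVX2 is enabled as the surrounding header requires:

#include <stdio.h>
#include <immintrin.h>   // AVX: _mm256_movemask_pd

int main(void)
{
   // 64-bit lanes 0..3 hold 4, -3, 2, -1; negative lanes have their MSB set.
   __m256i v = _mm256_set_epi64x( -1, 2, -3, 4 );
   int mask = _mm256_movemask_pd( _mm256_castsi256_pd( v ) );
   printf( "mask = 0x%x\n", mask );   // 0xa: bits 1 and 3 set
   return 0;
}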
@@ -348,7 +348,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated. Obsolete sm3, the only user, is grandfathered.
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
@@ -391,7 +391,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
/* Not used
// Limited 2 input shuffle
#define mm256_shuffle2_64( v1, v2, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -400,7 +399,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#define mm256_shuffle2_32( v1, v2, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
*/
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
@@ -513,8 +511,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
} while(0)
// swap 256 bit vectors in place.
// Deprecated, Shabal is the only user and it should be modified to reorder
// instructions.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
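(Illustration, not from the source.) The replacement comment's advice is that callers should exchange references rather than data: the XORs per 256-bit swap can be avoided entirely by keeping pointers to the two halves of the state and swapping the pointers. A hypothetical sketch of that pattern:

#include <immintrin.h>

// Hypothetical caller-side helper: 'a' and 'b' point into some hash state.
// Exchanging the pointers is free compared with mm256_swap512_256, which
// issues dependent 256-bit XORs through the ALU.
static inline void swap_refs( __m256i **a, __m256i **b )
{
   __m256i *t = *a;
   *a = *b;
   *b = t;
}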

View File

@@ -409,20 +409,19 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
/* Not used
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -437,7 +436,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2E2D2C2B2A292827, 0x262524232221201F, \
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
*/
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
@@ -451,7 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/* Not used
/*
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -498,7 +496,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
/* Not used
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -509,10 +506,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuffle2_32( v1, v2, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
*/
// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
// efficient generic versions don't exist.
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
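(Illustration, not from the source.) The immediate 0x4e selects 32-bit elements 2,3,0,1, which exchanges the two 64-bit halves of a 128-bit lane; _mm512_shuffle_epi32 simply applies that pattern to each of the four lanes. The effect is easiest to see with the 128-bit form of the intrinsic:

#include <stdio.h>
#include <emmintrin.h>   // SSE2: _mm_shuffle_epi32

int main(void)
{
   long long in[2] = { 1, 2 };
   long long out[2];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   // 0x4e: dst elements = src 2,3,0,1, i.e. the two qwords trade places.
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi32( v, 0x4e ) );
   printf( "%lld %lld\n", out[0], out[1] );   // 2 1
   return 0;
}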
@@ -522,11 +516,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
/* Not used
// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
*/
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency