This commit is contained in:
Jay D Dee
2020-11-09 13:19:05 -05:00
parent c85fb3842b
commit 4fa8fcea8b
18 changed files with 100 additions and 115 deletions

View File

@@ -135,11 +135,17 @@ static inline __m128i mm128_neg1_fn()
// Bitwise not (~v)
#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 )
// Unary negation of elements
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Fast, avoids using vector mask, but only available for 128 bit vectors.
#define mm128_mask_32( a, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( a ), mask ) )
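// Usage sketch (illustrative only, not part of the original source): the
// 4 bit mask is the insertps zero-mask, so set bits select the 32 bit
// elements to clear. Requires SSE4.1 for _mm_insert_ps.
// mm128_mask_32( v, 0x1 ):  { e3, e2, e1, e0 } -> { e3, e2, e1,  0 }
// mm128_mask_32( v, 0x9 ):  { e3, e2, e1, e0 } -> {  0, e2, e1,  0 }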
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
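// For example, mm128_add4_64( a, b, c, d ) computes (a+b) + (c+d); the two
// inner adds have no dependency on each other and can issue in parallel,
// giving a dependency depth of 2 instead of 3 for a left to right sum.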
@@ -269,11 +275,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 )
//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 )
//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 )
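// Illustrative mappings, elements shown high to low (not part of the
// original source):
// mm128_swap_64:  { x1, x0 }         -> { x0, x1 }
// mm128_ror_1x32: { e3, e2, e1, e0 } -> { e0, e3, e2, e1 }
// mm128_rol_1x32: { e3, e2, e1, e0 } -> { e2, e1, e0, e3 }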
@@ -282,53 +285,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_1x8( v ) _mm_alignr_epi8( v, v, 1 )
#define mm128_rol_1x8( v ) _mm_alignr_epi8( v, v, 15 )
// Rotate by c bytes
#define mm128_ror_x8( v, c ) _mm_alignr_epi8( v, v, c )
#define mm128_rol_x8( v, c ) _mm_alignr_epi8( v, v, 16-(c) )
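// For byte counts that are a multiple of an element size these reduce to the
// element rotates above, e.g. mm128_ror_x8( v, 4 ) matches mm128_ror_1x32( v )
// and mm128_ror_x8( v, 8 ) matches mm128_swap_64( v ).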
/*
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_ror_x8( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
#define mm128_rol_x8( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
#if defined (__SSE3__)
// no SSE2 implementation, no current users
#define mm128_ror_1x16( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
0x0908070605040302 ) )
#define mm128_rol_1x16( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
0x0504030201000f0e ) )
#define mm128_ror_1x8( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
0x0807060504030201 ) )
#define mm128_rol_1x8( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
0x060504030201000f ) )
#else // SSE2
#define mm128_ror_1x16( v ) \
_mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
#define mm128_rol_1x16( v ) \
_mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
#define mm128_ror_1x8( v ) \
_mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
#define mm128_rol_1x8( v ) \
_mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
#endif // SSE3 else SSE2
*/
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )

View File

@@ -26,8 +26,6 @@
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
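// Usage sketch (illustrative only, not from this commit): join two 128 bit
// values into one 256 bit vector, with hi landing in the upper lane.
// __m256i v = mm256_concat_128( hi, lo );   // yields { hi, lo }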
#define m256_const1_128( v ) \
_mm256_broadcastsi128_si256( v )
// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
@@ -144,10 +142,11 @@ do { \
// Parallel AES, for when x is expected to be in a 256 bit register.
// Use same 128 bit key.
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if 0
#define mm256_aesenc_2x128( x, k ) \
_mm256_aesenc_epi128( x, m256_const1_128(k ) )
_mm256_aesenc_epi128( x, k )
#else

View File

@@ -56,15 +56,15 @@
// If an expensive constant is to be reused in the same function it should
// be declared as a local variable defined once and reused.
//
// Permutations cab be very exppensive if they use a vector control index,
// Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient.
// The index is essentially a constant with all the baggage that brings.
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
//
// Additionally, permutations using smaller vectors can be more efficient
// if the permutation doesn't cross lane boundaries ,typically 128 bits,
// ans the smnaller vector can use an imm comtrol.
// if the permutation doesn't cross lane boundaries, typically 128 bits,
// and the smnaller vector can use an imm comtrol.
//
// If the permutation doesn't cross lane boundaries a shuffle instruction
// can be used with imm control instead of permute.
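// A minimal sketch of the guideline above (hypothetical helper, not part of
// this commit): the bswap shuffle index is an expensive vector constant, so
// it is built once as a local and reused, and the shuffle itself is a
// lane-local _mm_shuffle_epi8 (SSSE3) rather than a wider permute with a
// vector control index.
// static inline void bswap_32x4( __m128i *d, const __m128i *s )
// {
//    const __m128i bswap_idx = m128_const_64( 0x0c0d0e0f08090a0b,
//                                             0x0405060700010203 );
//    d[0] = _mm_shuffle_epi8( s[0], bswap_idx );
//    d[1] = _mm_shuffle_epi8( s[1], bswap_idx );
//    d[2] = _mm_shuffle_epi8( s[2], bswap_idx );
//    d[3] = _mm_shuffle_epi8( s[3], bswap_idx );
// }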
@@ -182,7 +182,10 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
//
// Basic operations without SIMD equivalent
// ~x
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
// -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
@@ -443,20 +446,13 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// Rename these for consistency. Element size is always last.
// mm<vectorsize>_<op><lanesize>_<elementsize>
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
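// Illustrative mapping, 64 bit elements shown high to low (not part of the
// original source); the rotation is applied independently in each 256 bit
// lane:
// mm512_ror256_64: { e7, e6, e5, e4,  e3, e2, e1, e0 } ->
//                  { e4, e7, e6, e5,  e0, e3, e2, e1 }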
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror256_32( v ) \