mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.15.1
This commit is contained in:
@@ -135,11 +135,17 @@ static inline __m128i mm128_neg1_fn()
|
||||
// Bitwise not (~v)
|
||||
#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 )
|
||||
|
||||
// Unary negation of elements
|
||||
// Unary negation of elements (-v)
|
||||
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
|
||||
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
|
||||
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
|
||||
|
||||
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
|
||||
// Fast, avoids using vector mask, but only available for 128 bit vectors.
|
||||
#define mm128_mask_32( a, mask ) \
|
||||
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \
|
||||
_mm_castsi128_ps( a ), mask ) )
|
||||
|
||||
// Add 4 values, fewer dependencies than sequential addition.
|
||||
#define mm128_add4_64( a, b, c, d ) \
|
||||
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
|
||||
@@ -269,11 +275,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
// Rotate vector elements accross all lanes
|
||||
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
|
||||
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
|
||||
|
||||
//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 )
|
||||
//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 )
|
||||
//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 )
|
||||
@@ -282,53 +285,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_ror_1x8( v ) _mm_alignr_epi8( v, v, 1 )
|
||||
#define mm128_rol_1x8( v ) _mm_alignr_epi8( v, v, 15 )
|
||||
|
||||
// Rotate by c bytes
|
||||
#define mm128_ror_x8( v, c ) _mm_alignr_epi8( v, c )
|
||||
#define mm128_rol_x8( v, c ) _mm_alignr_epi8( v, 16-(c) )
|
||||
|
||||
|
||||
/*
|
||||
// Rotate 16 byte (128 bit) vector by c bytes.
|
||||
// Less efficient using shift but more versatile. Use only for odd number
|
||||
// byte rotations. Use shuffle above whenever possible.
|
||||
#define mm128_ror_x8( v, c ) \
|
||||
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
|
||||
|
||||
#define mm128_rol_x8( v, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
|
||||
|
||||
#if defined (__SSE3__)
|
||||
// no SSE2 implementation, no current users
|
||||
|
||||
#define mm128_ror_1x16( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
|
||||
0x0908070605040302 ) )
|
||||
#define mm128_rol_1x16( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
|
||||
0x0504030201000f0e ) )
|
||||
#define mm128_ror_1x8( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
|
||||
0x0807060504030201 ) )
|
||||
#define mm128_rol_1x8( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
|
||||
0x060504030201000f ) )
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_ror_1x16( v ) \
|
||||
_mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
|
||||
|
||||
#define mm128_rol_1x16( v ) \
|
||||
_mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
|
||||
|
||||
#define mm128_ror_1x8( v ) \
|
||||
_mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
|
||||
|
||||
#define mm128_rol_1x8( v ) \
|
||||
_mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
|
||||
|
||||
#endif // SSE3 else SSE2
|
||||
*/
|
||||
|
||||
|
||||
// Invert vector: {3,2,1,0} -> {0,1,2,3}
|
||||
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
|
||||
|
||||
|
@@ -26,8 +26,6 @@
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
|
||||
|
||||
#define m256_const1_128( v ) \
|
||||
_mm256_broadcastsi128_si256( v )
|
||||
|
||||
// Equavalent of set, move 64 bit integer constants to respective 64 bit
|
||||
// elements.
|
||||
@@ -144,10 +142,11 @@ do { \
|
||||
|
||||
// Parallel AES, for when x is expected to be in a 256 bit register.
|
||||
// Use same 128 bit key.
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if 0
|
||||
#define mm256_aesenc_2x128( x, k ) \
|
||||
_mm256_aesenc_epi128( x, m256_const1_128(k ) )
|
||||
_mm256_aesenc_epi128( x, k )
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -56,15 +56,15 @@
|
||||
// If an expensive constant is to be reused in the same function it should
|
||||
// be declared as a local variable defined once and reused.
|
||||
//
|
||||
// Permutations cab be very exppensive if they use a vector control index,
|
||||
// Permutations can be very expensive if they use a vector control index,
|
||||
// even if the permutation itself is quite efficient.
|
||||
// The index is essentially a constant with all the baggage that brings.
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries ,typically 128 bits,
|
||||
// ans the smnaller vector can use an imm comtrol.
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smnaller vector can use an imm comtrol.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instructions
|
||||
// can be used with imm control instead of permute.
|
||||
@@ -182,7 +182,10 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// ~x
|
||||
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
|
||||
|
||||
// -x
|
||||
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
|
||||
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
|
||||
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
|
||||
@@ -443,20 +446,13 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
//
|
||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||
|
||||
// Rename these for consistency. Element size is always last.
|
||||
// mm<vectorsize>_<op><lanesize>_<elementsize>
|
||||
|
||||
|
||||
// Swap hi & lo 128 bits in each 256 bit lane
|
||||
|
||||
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||
|
||||
// Rotate 256 bit lanes by one 64 bit element
|
||||
|
||||
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
|
||||
#define mm512_ror256_32( v ) \
|
||||
|
Reference in New Issue
Block a user