Mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00
v3.21.0
@@ -193,8 +193,17 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic

// Bitwise not (~v)
#if defined(__AVX512VL__)

static inline __m128i mm128_not( const __m128i v )
{  return _mm_ternarylogic_epi64( v, v, v, 1 ); }

#else

#define mm128_not( v )   _mm_xor_si128( v, m128_neg1 )

#endif
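Why imm8 = 1 gives NOT: with all three ternarylogic inputs equal, each bit position presents either (0,0,0) or (1,1,1), and truth-table byte 0x01 maps those to 1 and 0. A minimal standalone check (my own illustrative sketch, not part of this header, assuming <immintrin.h> and an -mavx512vl build for the ternarylogic path) against the XOR-with-all-ones fallback:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const __m128i v    = _mm_set_epi64x( 0x0123456789abcdefLL, 0x0f0f0f0f0f0f0f0fLL );
   const __m128i ones = _mm_set1_epi64x( -1 );               // m128_neg1 equivalent
   const __m128i ref  = _mm_xor_si128( v, ones );            // ~v via XOR with all ones
#if defined(__AVX512VL__)
   // imm8 = 1: output bit is 1 only when all three inputs are 0, so with
   // identical arguments the result is the bitwise complement of v.
   const __m128i alt  = _mm_ternarylogic_epi64( v, v, v, 1 );
#else
   const __m128i alt  = ref;                                 // no AVX512VL: nothing to compare
#endif
   uint64_t r[2], a[2];
   _mm_storeu_si128( (__m128i*)r, ref );
   _mm_storeu_si128( (__m128i*)a, alt );
   printf( "not() match: %d\n", r[0] == a[0] && r[1] == a[1] );
   return 0;
}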

// Unary negation of elements (-v)
#define mm128_negate_64( v )   _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v )   _mm_sub_epi32( m128_zero, v )
@@ -439,7 +448,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) );
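A usage sketch for the two-input shuffle (my own standalone example, not from the header; the macro is copied locally, without the trailing semicolon, so the snippet compiles on its own). Control bit 0 selects the low result element from v1 and control bit 1 selects the high result element from v2:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Local copy of the macro shown above, for a self-contained example.
#define mm128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) )

int main(void)
{
   const __m128i a = _mm_set_epi64x( 0x1111, 0x2222 );   // a = { hi = 0x1111, lo = 0x2222 }
   const __m128i b = _mm_set_epi64x( 0x3333, 0x4444 );   // b = { hi = 0x3333, lo = 0x4444 }
   // c = 0: result low = lo(a), result high = lo(b).
   const __m128i r = mm128_shuffle2_64( a, b, 0 );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "lo=%llx hi=%llx\n",                          // prints: lo=2222 hi=4444
           (unsigned long long)out[0], (unsigned long long)out[1] );
   return 0;
}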
@@ -600,9 +609,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

#endif // SSSE3 else SSE2

//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.

// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
@@ -611,61 +617,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
   v1 = _mm_xor_si128( v1, v2 );


// Two input shuffle-rotate.
// Concatenate v1 & v2 and byte rotate as a 256 bit vector.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.

#if defined(__SSSE3__)

#define mm128_shufl2r_64( v1, v2 )   _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 )   _mm_alignr_epi8( v1, v2, 8 )

/*
#define mm128_shufl2r_32( v1, v2 )   _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 )   _mm_alignr_epi8( v1, v2, 4 )

#define mm128_shufl2r_16( v1, v2 )   _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 )   _mm_alignr_epi8( v1, v2, 2 )

#define mm128_shufl2r_8( v1, v2 )    _mm_alignr_epi8( v2, v1, 1 )
#define mm128_shufl2l_8( v1, v2 )    _mm_alignr_epi8( v1, v2, 1 )
*/
#define mm128_alignr_64( v1, v2 )    _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 )    _mm_alignr_epi8( v1, v2, 4 )

#else

#define mm128_shufl2r_64( v1, v2 ) \
   _mm_or_si128( _mm_srli_si128( v1, 8 ), \
                 _mm_slli_si128( v2, 8 ) )
#define mm128_alignr_64( v1, v2 )   _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                                                  _mm_srli_si128( v2, 8 ) )

#define mm128_shufl2l_64( v1, v2 ) \
   _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                 _mm_srli_si128( v2, 8 ) )
/*
#define mm128_shufl2r_32( v1, v2 ) \
   _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                 _mm_slli_si128( v2, 12 ) )
#define mm128_alignr_32( v1, v2 )   _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                                                  _mm_srli_si128( v2, 4 ) )

#define mm128_shufl2l_32( v1, v2 ) \
   _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                 _mm_srli_si128( v2, 12 ) )

#define mm128_shufl2r_16( v1, v2 ) \
   _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                 _mm_slli_si128( v2, 14 ) )

#define mm128_shufl2l_16( v1, v2 ) \
   _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                 _mm_srli_si128( v2, 14 ) )

#define mm128_shufl2r_8( v1, v2 ) \
   _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                 _mm_slli_si128( v2, 15 ) )

#define mm128_shufl2l_8( v1, v2 ) \
   _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                 _mm_srli_si128( v2, 15 ) )
*/
#endif
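To illustrate the SSE2 fallback above (a standalone sketch of my own, not part of the header): mm128_alignr_64( v1, v2 ) returns the middle 128 bits of the concatenation { v1 : v2 }, and the shift-and-OR form agrees with _mm_alignr_epi8 when SSSE3 is available.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const __m128i v1 = _mm_set_epi64x( 0x33, 0x22 );   // high half of { v1 : v2 }
   const __m128i v2 = _mm_set_epi64x( 0x11, 0x00 );   // low  half of { v1 : v2 }

   // SSE2 emulation: high 64 bits of v2 become the result's low half,
   // low 64 bits of v1 become the result's high half.
   const __m128i sse2  = _mm_or_si128( _mm_slli_si128( v1, 8 ),
                                       _mm_srli_si128( v2, 8 ) );
#if defined(__SSSE3__)
   const __m128i ssse3 = _mm_alignr_epi8( v1, v2, 8 );
#else
   const __m128i ssse3 = sse2;                        // no SSSE3: nothing to compare
#endif
   uint64_t a[2], b[2];
   _mm_storeu_si128( (__m128i*)a, sse2 );
   _mm_storeu_si128( (__m128i*)b, ssse3 );
   printf( "lo=%llx hi=%llx match=%d\n",              // prints: lo=11 hi=22 match=1
           (unsigned long long)a[0], (unsigned long long)a[1],
           a[0] == b[0] && a[1] == b[1] );
   return 0;
}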

// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
@@ -689,50 +657,6 @@ do { \
   v1 = t; \
} while(0)

/*
#define mm128_vror256_32( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
   v1 = _mm_alignr_epi8( v2, v1, 4 ); \
   v2 = t; \
} while(0)

#define mm128_vrol256_32( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
   v2 = _mm_alignr_epi8( v2, v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_vror256_16( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
   v1 = _mm_alignr_epi8( v2, v1, 2 ); \
   v2 = t; \
} while(0)

#define mm128_vrol256_16( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
   v2 = _mm_alignr_epi8( v2, v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_vror256_8( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
   v1 = _mm_alignr_epi8( v2, v1, 1 ); \
   v2 = t; \
} while(0)

#define mm128_vrol256_8( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
   v2 = _mm_alignr_epi8( v2, v1, 15 ); \
   v1 = t; \
} while(0)
*/

#else   // SSE2

#define mm128_vror256_64( v1, v2 ) \
@@ -752,61 +676,7 @@ do { \
                      _mm_srli_si128( v1, 8 ) ); \
   v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                             _mm_slli_si128( v2, 12 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
                      _mm_slli_si128( v1, 12 ) ); \
   v1 = t; \
} while(0)

#define mm128_vrol256_32( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                             _mm_srli_si128( v2, 12 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
                      _mm_srli_si128( v1, 12 ) ); \
   v1 = t; \
} while(0)

#define mm128_vror256_16( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                             _mm_slli_si128( v2, 14 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
                      _mm_slli_si128( v1, 14 ) ); \
   v1 = t; \
} while(0)

#define mm128_vrol256_16( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                             _mm_srli_si128( v2, 14 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
                      _mm_srli_si128( v1, 14 ) ); \
   v1 = t; \
} while(0)

#define mm128_vror256_8( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                             _mm_slli_si128( v2, 15 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
                      _mm_slli_si128( v1, 15 ) ); \
   v1 = t; \
} while(0)

#define mm128_vrol256_8( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                             _mm_srli_si128( v2, 15 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
                      _mm_srli_si128( v1, 15 ) ); \
   v1 = t; \
} while(0)
*/
#endif   // SSE4.1 else SSE2

#endif   // __SSE2__
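For reference, a standalone model of the rotate-in-place idea used above (my own illustrative sketch, not from the header): treat { v2 : v1 } as one 256-bit value with v1 as the low half and rotate it right by 64 bits using the same shift-and-OR pattern.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m128i v1 = _mm_set_epi64x( 1, 0 );   // low  128 bits: elements { 0, 1 }
   __m128i v2 = _mm_set_epi64x( 3, 2 );   // high 128 bits: elements { 2, 3 }

   // ror256_64: each 64-bit element moves one position toward the low end,
   // and the lowest element wraps around to the top.
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), _mm_slli_si128( v2, 8 ) );
   v2         = _mm_or_si128( _mm_srli_si128( v2, 8 ), _mm_slli_si128( v1, 8 ) );
   v1         = t;

   uint64_t lo[2], hi[2];
   _mm_storeu_si128( (__m128i*)lo, v1 );
   _mm_storeu_si128( (__m128i*)hi, v2 );
   printf( "%llu %llu %llu %llu\n",       // prints: 1 2 3 0
           (unsigned long long)lo[0], (unsigned long long)lo[1],
           (unsigned long long)hi[0], (unsigned long long)hi[1] );
   return 0;
}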
@@ -15,14 +15,13 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Some usage may have the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
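The 128-bit lane restriction is easy to demonstrate (a standalone AVX2 sketch of my own, not from this header): with an all-zero index vector, _mm256_shuffle_epi8 replicates byte 0 in the low lane but byte 16 in the high lane.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
   __m256i bytes = _mm256_setr_epi8(
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
      16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 );
   __m256i idx   = _mm256_setzero_si256();        // every index byte = 0
   __m256i r     = _mm256_shuffle_epi8( bytes, idx );
   unsigned char out[32];
   _mm256_storeu_si256( (__m256i*)out, r );
   // Low lane replicates byte 0, high lane replicates byte 16.
   printf( "out[0]=%u out[16]=%u\n", out[0], out[16] );   // prints: out[0]=0 out[16]=16
   return 0;
}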

#if defined(__AVX__)

@@ -141,7 +140,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Basic operations without SIMD equivalent

// Bitwise not ( ~v )
#if defined(__AVX512VL__)

static inline __m256i mm256_not( const __m256i v )
@@ -37,13 +37,21 @@
// version of this specific instruction does not.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
// vector but slower than epi8 which continues to be restricted to 128 bit
// lanes.
//
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
// AVX512-VBMI. The same instructions with larger elements don't have this
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
//
// Two coding conventions are used to prevent macro argument side effects:
// - if a macro arg is used in an expression it must be protected by
//   parentheses to ensure an expression argument is evaluated first.
// - if an argument is to be referenced multiple times a C inline function
//   should be used instead of a macro to prevent an expression argument
//   from being evaluated multiple times.
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
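The second convention in the hunk above is the usual macro-argument pitfall; a small standalone illustration (hypothetical helper names of my own, not from the header) counts how often an argument expression with side effects is evaluated by each form:

#include <immintrin.h>
#include <stdio.h>

// Macro form: the argument 'v' appears twice, so an expression argument is
// evaluated twice.
#define twice_macro( v )   _mm_add_epi64( v, v )

// Inline-function form: the argument expression is evaluated exactly once.
static inline __m128i twice_func( const __m128i v )
{  return _mm_add_epi64( v, v ); }

static int calls = 0;
static __m128i counted_load( void )     // stand-in for an argument with side effects
{  ++calls;  return _mm_set1_epi64x( 1 ); }

int main(void)
{
   __m128i r;
   calls = 0;   r = twice_macro( counted_load() );
   printf( "macro:    %d calls\n", calls );    // 2
   calls = 0;   r = twice_func( counted_load() );
   printf( "function: %d calls\n", calls );    // 1
   (void)r;
   return 0;
}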
@@ -184,7 +192,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent

// Bitwise NOT: ~x
// #define mm512_not( x )  _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{  return _mm512_ternarylogic_epi64( x, x, x, 1 ); }

@@ -295,7 +302,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nand( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0xef )


/*
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
@@ -313,6 +320,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
   _mm512_mask_blend_epi32( 0x3333, \
         _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
         _mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
*/

/*
//
@@ -374,6 +382,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32   _mm512_ror_epi32
#define mm512_rol_32   _mm512_rol_epi32

/*
#if defined(__AVX512VBMI2__)

// Use C inline function in case arg is coded as an expression.
static inline __m512i mm512_ror_16( __m512i v, int c )
{  return _mm512_shrdi_epi16( v, v, c ); }

static inline __m512i mm512_rol_16( __m512i v, int c )
{  return _mm512_shldi_epi16( v, v, c ); }

#endif
*/

//
// Reverse byte order of packed elements, vectorized endian conversion.

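The byte-order reversal mentioned above is conventionally done with a byte shuffle; a standalone SSSE3 sketch of my own (not the header's definition) reverses the bytes of each 32-bit element in one _mm_shuffle_epi8:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   // Index byte i of the control selects the source byte for result byte i,
   // so { 3,2,1,0, 7,6,5,4, ... } byte-swaps every 32-bit element.
   const __m128i bswap32_idx = _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 );
   const __m128i v = _mm_set1_epi32( 0x11223344 );
   const __m128i r = _mm_shuffle_epi8( v, bswap32_idx );   // per-element bswap
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "0x%08x\n", out[0] );                           // prints: 0x44332211
   return 0;
}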
@@ -518,7 +539,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )

//
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.

// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
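Why 0x4e swaps 128-bit halves: the control packs the element order { 2, 3, 0, 1 } into two-bit fields. An AVX2 analogue on a single 256-bit vector (my own illustrative sketch, not the AVX512 macro itself) makes the movement visible:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m256i v = _mm256_setr_epi64x( 0, 1, 2, 3 );
   __m256i r = _mm256_permute4x64_epi64( v, 0x4e );   // 0x4e = 01 00 11 10
   uint64_t out[4];
   _mm256_storeu_si256( (__m256i*)out, r );
   printf( "%llu %llu %llu %llu\n",                   // prints: 2 3 0 1
           (unsigned long long)out[0], (unsigned long long)out[1],
           (unsigned long long)out[2], (unsigned long long)out[3] );
   return 0;
}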
@@ -623,22 +643,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr32_8( v )   _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v )   _mm512_rol_epi32( v, 8 )

/*
// 2 input, 1 output
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 )   _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 )   _mm512_alignr_epi64( v1, v2, 4 )

#define mm512_shufl2r_128( v1, v2 )   _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 )   _mm512_alignr_epi64( v1, v2, 2 )

#define mm512_shufl2r_64( v1, v2 )    _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 )    _mm512_alignr_epi64( v1, v2, 1 )

#define mm512_shufl2r_32( v1, v2 )    _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 )    _mm512_alignr_epi32( v1, v2, 1 )
*/

#endif // AVX512
#endif // SIMD_512_H__