mirror of https://github.com/JayDDee/cpuminer-opt.git

v3.22.3
@@ -42,10 +42,10 @@ typedef union
   uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;

// Efficient and convenient moving between GP & low bits of XMM.
// Use VEX when available to give access to xmm8-15 and zero extend for
// larger vectors.

// Deprecated. EVEX adds support for an integer argument in the broadcast
// instruction, eliminating the need for an explicit move in most cases. Use
// the set1 intrinsic with integers and let the compiler figure it out.

static inline __m128i mm128_mov64_128( const uint64_t n )
{
   __m128i a;
@@ -68,65 +68,27 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
   return a;
}

// Inconsistent naming, prefix should reflect the return value:
// u64_mov128_64

static inline uint64_t u64_mov128_64( const __m128i a )
{
   uint64_t n;
#if defined(__AVX__)
   asm( "vmovq %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
   asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
   return n;
}

static inline uint32_t u32_mov128_32( const __m128i a )
{
   uint32_t n;
#if defined(__AVX__)
   asm( "vmovd %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
   asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
   return n;
}

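The two directions pair up: mm128_mov64_128 / mm128_mov32_128 move a scalar into the low bits of an XMM register, u64_mov128_64 / u32_mov128_32 move it back out. A self-contained round-trip check, using illustrative local helpers (mov64_128 / mov128_64 are names invented here, not part of the header) that follow the same VEX / non-VEX asm pattern, assuming x86-64 and GCC or Clang:

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

// Illustrative local helpers, not part of the header.
static inline __m128i mov64_128( const uint64_t n )
{
   __m128i a;
#if defined(__AVX__)
   asm( "vmovq %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
   asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
   return a;   // low 64 bits hold n, upper bits zeroed
}

static inline uint64_t mov128_64( const __m128i a )
{
   uint64_t n;
#if defined(__AVX__)
   asm( "vmovq %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
   asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
   return n;
}

int main()
{
   uint64_t x = 0x0123456789abcdefULL;
   printf( "%016llx\n", (unsigned long long) mov128_64( mov64_128( x ) ) );
   return 0;   // prints 0123456789abcdef
}
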
// Emulate broadcast & insert instructions not available in SSE2.
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )

// FYI only, not used anywhere
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )

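The shuffle immediates do the broadcasting: 0x44 selects dwords {0,1,0,1} (low to high), repeating the low qword, and 0x00 selects dword 0 into every lane. A small check against _mm_set1_epi64x, assuming x86-64 for _mm_cvtsi64_si128:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main()
{
   // 0x44 = 01'00'01'00b: lane selectors {0,1,0,1} low to high, i.e. the low
   // qword repeated into both halves.
   __m128i lo  = _mm_cvtsi64_si128( (long long) 0x1122334455667788ULL );
   __m128i b64 = _mm_shuffle_epi32( lo, 0x44 );
   __m128i ref = _mm_set1_epi64x( (long long) 0x1122334455667788ULL );
   printf( "%d\n", _mm_movemask_epi8( _mm_cmpeq_epi8( b64, ref ) ) == 0xffff );
   return 0;   // prints 1
}
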
#define m128_const_i128( i ) mm128_mov64_128( i )

// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32

#if defined(__SSE4_1__)

// Assign 64 bit integers to respective elements: {hi, lo}
#define m128_const_64( hi, lo ) \
   _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )

#else

// Deprecated, use set1 directly
#define m128_const1_64 _mm_set1_epi64x
#define m128_const1_32 _mm_set1_epi32

// Deprecated, use set directly
#define m128_const_64 _mm_set_epi64x

#endif

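Usage note: m128_const_64( hi, lo ) places hi in element 1 and lo in element 0, matching _mm_set_epi64x. A minimal check of the SSE4.1 path written directly with the intrinsics it wraps (compile with -msse4.1):

#include <stdio.h>
#include <smmintrin.h>   // SSE4.1

int main()
{
   // Element 1 holds hi, element 0 holds lo, just like _mm_set_epi64x( hi, lo ).
   __m128i a = _mm_insert_epi64( _mm_cvtsi64_si128( 0x22 ), 0x11, 1 );
   printf( "hi=%llx lo=%llx\n",
           (unsigned long long) _mm_extract_epi64( a, 1 ),
           (unsigned long long) _mm_extract_epi64( a, 0 ) );   // hi=11 lo=22
   return 0;
}
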
// Pseudo constants

#define m128_zero      _mm_setzero_si128()
#define m128_one_128   mm128_mov64_128( 1 )
#define m128_one_64    mm128_bcast_i64( 1 )
#define m128_one_32    mm128_bcast_i32( 1 )
#define m128_one_16    mm128_bcast_i32( 0x00010001 )
#define m128_one_8     mm128_bcast_i32( 0x01010101 )
//#define m128_one_64  _mm_set1_epi64x( 1 )
#define m128_one_32    _mm_set1_epi32( 1 )

// ASM avoids having to initialize the return variable just to silence a
// compiler warning. The macro abstracts the function parentheses to look
// like an identifier.

static inline __m128i mm128_neg1_fn()
{
   __m128i a;
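The rest of the body is cut off by the next hunk. A common way to produce all-ones without a load is pcmpeqd of a register with itself; a minimal sketch under the assumption that the implementation follows that idiom and the same asm style as the functions above:

#include <emmintrin.h>

// Hypothetical sketch only; the real body is not visible in this diff.
static inline __m128i neg1_sketch()
{
   __m128i a;
   asm( "pcmpeqd %0, %0\n\t" : "=x"(a) );   // reg compared with itself: all ones
   return a;
}
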
@@ -184,15 +146,11 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
                                       const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }

// Extract 32 bit element c from v and return as integer.
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }

// Zero 32 bit elements when bit in mask is set.
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }

// Move element i2 of v2 to element i1 of v1 and return updated v1.
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
   mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )

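mm128_xim_32 itself is not shown in this hunk, but the immediates line up with the SSE4.1 insertps encoding: bits 7:6 select the source element, bits 5:4 the destination element, and bits 3:0 are a zero mask. That is why c<<4 inserts into element c, c<<6 extracts element c, and a bare 4-bit value zeroes elements. A sketch using _mm_insert_ps directly, assuming that is what the macro wraps:

#include <stdio.h>
#include <smmintrin.h>   // SSE4.1: insertps

int main()
{
   __m128i v = _mm_set_epi32( 33, 22, 11, 0 );   // lanes 3..0
   // imm = 2<<6: copy source element 2 into destination element 0.
   __m128i x = _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v ),
                                                _mm_castsi128_ps( v ), 2<<6 ) );
   printf( "%d\n", _mm_cvtsi128_si32( x ) );   // prints 22
   return 0;
}
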
@@ -213,13 +171,6 @@ static inline __m128i mm128_not( const __m128i v )

#endif

/*
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
*/

// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
   _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
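As the comment says, the paired form shortens the dependency chain. A side-by-side sketch for comparison, with illustrative helper names:

#include <emmintrin.h>

// The sequential form chains three dependent additions; the paired form only
// two, so the inner adds can issue in parallel on a superscalar core.
static inline __m128i add4_sequential( __m128i a, __m128i b, __m128i c, __m128i d )
{  return _mm_add_epi64( _mm_add_epi64( _mm_add_epi64( a, b ), c ), d );  }

static inline __m128i add4_paired( __m128i a, __m128i b, __m128i c, __m128i d )
{  return _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) );  }
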
@@ -384,16 +335,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#endif // AVX512 else SSE2

#define mm128_ror_16( v, c ) \
   _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )

#define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

// Deprecated.
#define mm128_rol_var_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

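SSE2 has no per-lane rotate, hence the shift-and-or emulation. A quick self-contained check of the rotate-left-by-3 case against a scalar rotate:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main()
{
   __m128i v = _mm_set1_epi16( (short) 0x1234 );
   // Rotate each 16-bit lane left by 3: shift left, shift right by 16-3, OR.
   __m128i r = _mm_or_si128( _mm_slli_epi16( v, 3 ), _mm_srli_epi16( v, 13 ) );
   uint16_t lane0 = (uint16_t) _mm_cvtsi128_si32( r );
   uint16_t ref   = (uint16_t) ( ( 0x1234u << 3 ) | ( 0x1234u >> 13 ) );
   printf( "%04x %04x\n", lane0, ref );   // both 91a0
   return 0;
}
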
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -415,6 +356,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )

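The immediates encode one-lane rotations: 0x39 selects lanes {1,2,3,0} (low to high), rotating the four 32-bit elements right by one position, while 0x93 rotates them left. A small demonstration:

#include <stdio.h>
#include <emmintrin.h>

int main()
{
   __m128i v = _mm_set_epi32( 3, 2, 1, 0 );     // lane i holds i
   __m128i r = _mm_shuffle_epi32( v, 0x39 );    // rotate lanes right by one
   __m128i l = _mm_shuffle_epi32( v, 0x93 );    // rotate lanes left by one
   printf( "%d %d\n", _mm_cvtsi128_si32( r ), _mm_cvtsi128_si32( l ) );  // 1 3
   return 0;
}
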
/* Not used
#if defined(__SSSE3__)

// Rotate right by c bytes, no SSE2 equivalent.
@@ -422,6 +364,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }

#endif
*/

// Rotate 64 bit lanes

@@ -471,25 +414,25 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#if defined(__SSSE3__)

#define mm128_bswap_128( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
                                       0x08090a0b0c0d0e0f ) )
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
                                        0x08090a0b0c0d0e0f ) )

#define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
                                        0x0001020304050607 ) )

#define mm128_bswap_32( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
                                       0x0405060700010203 ) )
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                        0x0405060700010203 ) )

#define mm128_bswap_16( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0e0f0c0d0a0b0809, \
                                       0x0607040502030001 ) )
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
                                        0x0607040502030001 ) )

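pshufb works as a byte gather: each byte of the control vector names the source byte to copy into that destination position, so listing 3,2,1,0 within every dword reverses byte order per 32-bit lane. A check of the bswap_32 control vector against the GCC/Clang __builtin_bswap32:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: pshufb

int main()
{
   __m128i v   = _mm_set1_epi32( (int) 0x11223344 );
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0bLL, 0x0405060700010203LL );
   __m128i r   = _mm_shuffle_epi8( v, ctl );     // byte-swap each 32-bit lane
   printf( "%08x %08x\n", (uint32_t) _mm_cvtsi128_si32( r ),
           __builtin_bswap32( 0x11223344u ) );   // both 44332211
   return 0;
}
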
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) do \
{ \
   __m128i ctl = m128_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
   casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
   casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
@@ -503,7 +446,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) do \
{ \
   __m128i ctl = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
   casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
   casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
@@ -564,14 +507,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

#endif // SSSE3 else SSE2

// Swap 128 bit vectors.
// This should be avoided; it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );

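For reference, the three-XOR exchange in isolation; as the comment notes, swapping the references is normally cheaper than swapping the data:

#include <stdio.h>
#include <emmintrin.h>

int main()
{
   __m128i a = _mm_set1_epi32( 1 ), b = _mm_set1_epi32( 2 );
   a = _mm_xor_si128( a, b );
   b = _mm_xor_si128( a, b );   // b now holds the original a
   a = _mm_xor_si128( a, b );   // a now holds the original b
   printf( "%d %d\n", _mm_cvtsi128_si32( a ), _mm_cvtsi128_si32( b ) );  // 2 1
   return 0;
}
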
// The alignr instruction for 32 & 64 bit elements is only available with
// AVX512 but is emulated here. Behaviour is consistent with the Intel alignr
// intrinsics.