Mirror of https://github.com/JayDDee/cpuminer-opt.git
v24.4
@@ -2436,7 +2436,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                             0x0405060700010203 );
                                             0x0405060700010203 );
   const __m512i c1 = v512_64( 1 );
   v128_t s0 = casti_v128( src,0 );
   v128_t s1 = casti_v128( src,1 );
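For reference, the bswap_shuf constant above is the usual pshufb pattern for reversing the bytes within each 32-bit element. A minimal sketch of what that mask does on its own, assuming SSSE3 (helper name is illustrative):

#include <immintrin.h>

// Byte-swap every 32-bit lane of a 128-bit vector using pshufb.
static inline __m128i sketch_bswap32_128( __m128i v )
{
   const __m128i mask = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                        0x0405060700010203 );
   return _mm_shuffle_epi8( v, mask );   // byte i of each dword becomes byte 3-i
}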
@@ -165,6 +165,11 @@ typedef union
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.

#define v128_mov64 _mm_cvtsi64_si128
#define v128_mov32 _mm_cvtsi32_si128

/*
static inline __m128i v128_mov64( const uint64_t n )
{
   __m128i a;
@@ -186,11 +191,14 @@ static inline __m128i v128_mov32( const uint32_t n )
#endif
   return a;
}
*/

// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )

// Not used, test first
/*
#if defined(__AVX2__)

#define v128_bcast16(v) _mm_broadcastw_epi16(v)
@@ -198,9 +206,10 @@ static inline __m128i v128_mov32( const uint32_t n )
#else

#define v128_bcast16(v) \
   v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
   _mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )

#endif
*/

// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
@@ -216,28 +225,15 @@ static inline __m128i v128_mov32( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()

#if defined(__SSE4_1__)

// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)

// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }

#endif

// Bitwise compare, return 1 if all bits set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)

#define v128_one v128_mov64(1)
//#define v128_one v128_mov64(1)
#define v128_one _mm_cvtsi64_si128( 1 )

// ASM avoids the need to initialize the return variable, avoiding a compiler warning.
// The macro abstracts the function parentheses to look like an identifier.
static inline __m128i v128_neg1_fn()
{
   __m128i a;
#if defined(__AVX__)
#if defined(__AVX__)
   asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
   asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
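The comment above refers to the warning the plain intrinsic form tends to trigger: building all-ones by comparing an uninitialized variable with itself. A sketch of that alternative, assuming SSE2 (helper name is illustrative):

#include <immintrin.h>

// x == x is true in every lane, so the compare yields all ones regardless of the value.
// _mm_undefined_si128() documents the "don't care" source; without it the compare reads
// an uninitialized variable and may draw -Wuninitialized, which the asm version avoids.
static inline __m128i sketch_neg1_intrinsic( void )
{
   __m128i a = _mm_undefined_si128();
   return _mm_cmpeq_epi32( a, a );
}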
@@ -268,7 +264,6 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
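A small usage sketch of the cast macros above, assuming a 16-byte aligned buffer (variable names are illustrative):

#include <emmintrin.h>
#include <stdint.h>

static void sketch_casti_usage( void )
{
   uint32_t hash[16] __attribute__((aligned(16))) = {0};
   casti_v128( hash, 0 ) = _mm_set1_epi32( 1 );   // store to hash[0..3]
   __m128i x = casti_v128( hash, 1 );             // load hash[4..7] as one vector
   (void)x;
}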
@@ -279,13 +274,14 @@ static inline __m128i v128_neg1_fn()
#define casto_v128(p,o) (((__m128i*)(p))+(o))

#if defined(__SSE4_1__)

#define v128_get64( v, l ) _mm_extract_epi64( v, l )
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
#define v128_get8( v, l ) _mm_extract_epi8( v, l )

#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l ) _mm_insert_epi64( v, u32, l )
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
@@ -396,7 +392,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy

// Boolean operations
#if defined(VL256)
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.

// ~v1 | v0
#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
@@ -430,7 +429,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#else

#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )

#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
@@ -464,9 +463,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_movmask32( v ) \
   _mm_movemask_ps( (__m128)(v) )

//
// Bit rotations

// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
   _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
@@ -476,6 +473,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )

//
// Bit rotations

// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
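The SSE2 fallback above is the standard shift-and-or rotate applied per 64-bit lane; the same identity on a scalar, for comparison (helper name is illustrative):

#include <stdint.h>

// Rotate right by c bits, valid for 0 < c < 64.
static inline uint64_t sketch_ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}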
@@ -601,7 +601,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#endif

// ror( v1 ^ v0, n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )

/* not used
@@ -700,15 +700,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_shuffle16( v, 0x1b )

// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )

#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )

// Endian byte swap.

#if defined(__SSSE3__)
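The shuffle immediates above pack one 2-bit source index per destination element, bits 1:0 for element 0 up to bits 7:6 for element 3. Decoding them shows why they reverse or rotate (a sketch in comment form):

// 0x39 = 00 11 10 01 : dst = { src[1], src[2], src[3], src[0] }  -> rotate lanes right by one
// 0x93 = 10 01 00 11 : dst = { src[3], src[0], src[1], src[2] }  -> rotate lanes left  by one
// 0x4e = 01 00 11 10 : dst = { src[2], src[3], src[0], src[1] }  -> swap 64-bit halves
// 0x1b = 00 01 10 11 : dst = { src[3], src[2], src[1], src[0] }  -> reverse all four lanes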
@@ -911,25 +907,27 @@ static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
#else

#define v128_alignr8( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
   _mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )

// c arg is trivial, only valid value is 1
#define v128_alignr64( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )

#define v128_alignr32( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )

#endif
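The corrected fallbacks above reproduce palignr: the result is the low 16 bytes of the 32-byte concatenation hi:lo shifted right by c bytes, so the hi half must be shifted left by 16-c rather than by c. A fixed-shift sketch of the identity, assuming SSE2 only (helper name is illustrative):

#include <emmintrin.h>

// Bytes 4..15 of lo land in the low 12 bytes, bytes 0..3 of hi in the top 4 bytes;
// same result as _mm_alignr_epi8( hi, lo, 4 ) on SSSE3.
static inline __m128i sketch_alignr8_4( __m128i hi, __m128i lo )
{
   return _mm_or_si128( _mm_slli_si128( hi, 12 ), _mm_srli_si128( lo, 4 ) );
}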
// blend using vector mask
#if defined(__SSE4_1__)

// Bytewise using sign bit of each byte element of mask
// Bytewise using sign bit of each byte element of mask. Use full bitmask
// for compatibility with SSE2 & NEON.
#define v128_blendv _mm_blendv_epi8

#else

// Bitwise
// Bitwise, use only byte wise for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
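The compatibility notes above amount to one rule: only masks whose elements are entirely zeros or entirely ones behave the same under the SSE4.1, SSE2 and NEON versions, and such masks normally come from a compare. A sketch of the safe pattern, assuming SSE2 (helper name is illustrative):

#include <emmintrin.h>

// Per 32-bit lane: pick a where a > b, else b (lane-wise signed max).
static inline __m128i sketch_select_gt32( __m128i a, __m128i b )
{
   const __m128i mask = _mm_cmpgt_epi32( a, b );        // full-width 0 / -1 per lane
   return _mm_or_si128( _mm_and_si128( mask, a ),       // a where mask is all ones
                        _mm_andnot_si128( mask, b ) );  // b where mask is all zeros
}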
@@ -172,7 +172,7 @@ static inline __m256i mm256_not( const __m256i v )

#else

#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )

#define mm256_xor3( a, b, c ) \
   _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
@@ -217,12 +217,11 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
   _mm256_movemask_ps( _mm256_castsi256_ps( v ) )

//
// Bit rotations.

// shuffle 16 bit elements within 64 bit lanes.
#define mm256_shuffle16( v, c ) \
   _mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )

// reverse elements within lanes.
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
@@ -242,6 +241,9 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
                  v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )

//
// Bit rotations.

// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses an 8 bit truth table to define any 3 input logical
// expression using any number or combination of AND, OR, XOR, NOT.
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.

// ~v1 | v0
#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
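The immediate is the 8-bit truth table of the desired function, indexed by (a<<2)|(b<<1)|c over the three inputs; a short derivation of the 0xcf used above for ~v1 | v0, sketched as comments:

// For arguments ( v1, v0, v0 ):  a = v1, b = c = v0, desired result = ~a | b.
//   index (a,b,c) : 0  1  2  3  4  5  6  7
//   ~a | b        : 1  1  1  1  0  0  1  1
// Reading the outputs as bits 7..0 gives 1100 1111 = 0xcf.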
@@ -456,7 +456,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
#define v128_swap64 v128_rev64 // grandfathered

#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )

// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -465,12 +464,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }

static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }

static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }

// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
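vextq with the same vector in both operands is the NEON idiom for a lane rotation: vextq_u32( v, v, n ) returns lanes n..3 of v followed by lanes 0..n-1. A usage sketch, assuming NEON (values and helper name are illustrative):

#include <arm_neon.h>

static inline uint32x4_t sketch_rotate_lanes( void )
{
   const uint32_t in[4] = { 0, 1, 2, 3 };
   uint32x4_t v = vld1q_u32( in );
   return vextq_u32( v, v, 1 );   // lanes become { 1, 2, 3, 0 }
}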
@@ -547,7 +540,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}

// Bitwise blend using vector mask
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )

#endif // __ARM_NEON