mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v23.9
@@ -65,8 +65,6 @@
 #define v128_add32         _mm_add_epi32
 #define v128_add16         _mm_add_epi16
 #define v128_add8          _mm_add_epi8
-#define v128_add4_64       mm128_add4_64
-#define v128_add4_32       mm128_add4_32
 
 #define v128_sub64         _mm_sub_epi64
 #define v128_sub32         _mm_sub_epi32
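
The v128_* names above are thin wrappers over the corresponding SSE2 intrinsics on x86. A minimal usage sketch (illustration only, not part of the commit; assumes <emmintrin.h> and the v128_add32 definition above):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

#define v128_add32 _mm_add_epi32   /* as defined in the hunk above */

int main(void)
{
   __m128i a = _mm_set_epi32( 4, 3, 2, 1 );      /* lanes low to high: 1, 2, 3, 4 */
   __m128i b = _mm_set_epi32( 40, 30, 20, 10 );  /* lanes low to high: 10, 20, 30, 40 */
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, v128_add32( a, b ) );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );   /* 11 22 33 44 */
   return 0;
}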
@@ -120,8 +118,8 @@
 #define v128_xor           _mm_xor_si128
 #define v128_xorq          _mm_xor_si128
 #define v128_andnot        _mm_andnot_si128
-#define v128_xnor( a, b )  mm128_not( _mm_xor_si128( a, b ) )
-#define v128_ornot( a, b ) mm128_or( a, mm128_not( b ) )
+#define v128_xnor( a, b )  v128_not( _mm_xor_si128( a, b ) )
+#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
 
 // ternary
 #define v128_xorandnot( v2, v1, v0 ) \
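
The change above swaps the helpers used inside v128_xnor and v128_ornot. A sketch of what the new forms expand to (illustration only; v128_not here is an assumed helper defined as XOR with all-ones, the usual idiom in this header family):

#include <emmintrin.h>

#define v128_not( x )        _mm_xor_si128( x, _mm_set1_epi32( -1 ) )   /* assumed helper */
#define v128_xnor( a, b )    v128_not( _mm_xor_si128( a, b ) )          /* ~(a ^ b) */
#define v128_ornot( a, b )   _mm_or_si128( a, v128_not( b ) )           /* a | ~b   */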
@@ -135,13 +133,6 @@
 #define v128_xoror( a, b, c )   _mm_xor_si128( a, _mm_or_si128( b, c ) )
 #define v128_orand( a, b, c )   _mm_or_si128( a, _mm_and_si128( b, c ) )
 
-// shift 2 concatenated vectors right
-#define v128_alignr64   mm128_alignr_64
-#define v128_alignr32   mm128_alignr_32
-#if defined(__SSSE3__)
-#define v128_alignr8    _mm_alignr_epi8
-#endif
-
 // unpack
 #define v128_unpacklo64   _mm_unpacklo_epi64
 #define v128_unpackhi64   _mm_unpackhi_epi64
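
The unpack wrappers keep their usual SSE2 meaning: the "lo" form gathers the low 64-bit lanes of the two inputs, the "hi" form the high lanes. A one-line sketch (illustration only, not part of the commit):

#include <emmintrin.h>

/* v128_unpacklo64( a, b ) -> { a[0], b[0] },  v128_unpackhi64( a, b ) -> { a[1], b[1] } */
static inline __m128i lo_lanes( __m128i a, __m128i b ) { return _mm_unpacklo_epi64( a, b ); }
static inline __m128i hi_lanes( __m128i a, __m128i b ) { return _mm_unpackhi_epi64( a, b ); }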
@@ -404,21 +395,24 @@ static inline __m128i mm128_negate_16( __m128i v )
 
 
 // Add 4 values, fewer dependencies than sequential addition.
-#define mm128_add4_64( a, b, c, d ) \
+#define v128_add4_64( a, b, c, d ) \
    _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
+#define mm128_add4_64 v128_add4_64
 
-#define mm128_add4_32( a, b, c, d ) \
+#define v128_add4_32( a, b, c, d ) \
    _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
-#define v128_add4_32 mm128_add4_32
+#define mm128_add4_32 v128_add4_32
 
-#define mm128_add4_16( a, b, c, d ) \
+#define v128_add4_16( a, b, c, d ) \
    _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
 
-#define mm128_add4_8( a, b, c, d ) \
+#define v128_add4_8( a, b, c, d ) \
    _mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
 
-#define mm128_xor4( a, b, c, d ) \
+#define v128_xor4( a, b, c, d ) \
    _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
+#define mm128_xor4 v128_xor4
 
 
 // Memory functions
 // Mostly for convenience, avoids calculating bytes.
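
The comment above ("fewer dependencies than sequential addition") refers to pairing the additions into a tree so the first two adds can execute in parallel. A sketch of the difference (illustration only, not part of the commit):

#include <emmintrin.h>

#define v128_add4_64( a, b, c, d ) \
   _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )

/* depth 3: each add must wait for the previous result */
static inline __m128i sum4_sequential( __m128i a, __m128i b, __m128i c, __m128i d )
{
   return _mm_add_epi64( _mm_add_epi64( _mm_add_epi64( a, b ), c ), d );
}

/* depth 2: (a+b) and (c+d) are independent and can issue together */
static inline __m128i sum4_paired( __m128i a, __m128i b, __m128i c, __m128i d )
{
   return v128_add4_64( a, b, c, d );
}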
@@ -984,18 +978,23 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 
 #if defined(__SSSE3__)
 
-#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
-#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
+#define v128_alignr8 _mm_alignr_epi8
+#define v128_alignr64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
+#define v128_alignr32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
 
 #else
 
-#define mm128_alignr_64( hi, lo, c ) \
+#define v128_alignr64( hi, lo, c ) \
    _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
 
-#define mm128_alignr_32( hi, lo, c ) \
+#define v128_alignr32( hi, lo, c ) \
    _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
 
 #endif
+#define mm128_alignr_64 v128_alignr64
+#define mm128_alignr_32 v128_alignr32
+#define mm128_alignr_8 v128_alignr32
 
 
 // NEON only uses vector mask. x86 blend selects second arg when control bit
 // is set. Blendv selects second arg when sign bit is set. And masking is the
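
On SSSE3 the alignr wrappers map straight onto _mm_alignr_epi8; the SSE2 fallback builds the same result from two byte shifts and an OR. A small check of what v128_alignr64( hi, lo, 1 ) produces (illustration only, not part of the commit):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m128i lo = _mm_set_epi64x( 0x1111111111111111ULL, 0x0000000000000000ULL );
   __m128i hi = _mm_set_epi64x( 0x3333333333333333ULL, 0x2222222222222222ULL );
   /* SSE2 fallback form from the hunk above, with c = 1:
      takes the middle two 64-bit lanes of the concatenation hi:lo */
   __m128i r  = _mm_or_si128( _mm_slli_si128( hi, 8 ), _mm_srli_si128( lo, 8 ) );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   /* out[0] = lo's high lane (0x1111...), out[1] = hi's low lane (0x2222...) */
   printf( "%016llx %016llx\n",
           (unsigned long long)out[0], (unsigned long long)out[1] );
   return 0;
}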
@@ -336,27 +336,27 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 
 // Bit rotation
 #define v128_ror64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
    : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
 
 #define v128_rol64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
    : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
 
 #define v128_ror32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
    : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
 
 #define v128_rol32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
    : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
 
 #define v128_ror16( v, c ) \
-   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
+   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
    : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
 
 #define v128_rol16( v, c ) \
-   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
+   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
    : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
 
 #define v128_ror8( v, c ) \
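
The NEON rotates combine a shift left with a shift-right-insert; the special cases for half-width rotate counts use the vrevNNq intrinsics, whose operand types are what this hunk corrects (vrev64q_u32 takes a uint32x4_t, vrev32q_u16 a uint16x8_t, vrev16q_u8 a uint8x16_t). A sketch of the two paths for a 32-bit rotate (illustration only, not part of the commit):

#include <arm_neon.h>

static inline uint32x4_t ror32_by7( uint32x4_t v )
{  /* generic path, c = 7: low bits shifted up, then the shifted-down value inserted */
   return vsriq_n_u32( vshlq_n_u32( v, 32 - 7 ), v, 7 );
}

static inline uint32x4_t ror32_by16( uint32x4_t v )
{  /* special case: rotating a 32-bit lane by 16 is just swapping its 16-bit halves */
   return (uint32x4_t)vrev32q_u16( (uint16x8_t)v );
}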