Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: v24.2
@@ -469,7 +469,7 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
 #if defined(__SSSE3__)
 
   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                            0x0405060700010203 );
+                                            0x0405060700010203 );
 
   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
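Aside (not part of the commit): the bswap_shuf constant above is a pshufb control that reverses the four bytes inside each 32-bit lane, i.e. a per-lane byte swap. A minimal standalone check, assuming SSSE3 is available:

// Illustrative only: verify that the shuffle control byte-swaps each 32-bit lane.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   __m128i v = _mm_set_epi32( 0x44434241, 0x34333231, 0x24232221, 0x14131211 );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, bswap_shuf ) );
   printf( "%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3] );
   // expected: 11121314 21222324 31323334 41424344
   return 0;
}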
@@ -913,9 +913,7 @@ static inline void extr_lane_8x32( void *d, const void *s,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
-
-//TODO Enable for AVX10_256 AVX10_512
+#if defined(VL256) && defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
@@ -977,7 +975,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 {
   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                            0x0405060700010203 );
+                                            0x0405060700010203 );
   const __m256i c1 = v256_32( 1 );
   const __m256i c2 = _mm256_add_epi32( c1, c1 );
   const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1035,7 +1033,8 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
                                  _mm256_castsi128_si256( s4 ), c3 );
 }
 
 #endif  // AVX512VBMI else
+#endif
 
 #endif  // AVX2
 
 // 16x32
@@ -1417,11 +1416,9 @@ static inline void extr_lane_16x32( void *d, const void *s,
    ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
 }
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
-#if defined(__AVX512VBMI__)
-
-// TODO Enable for AVX10_512
+#if defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
@@ -1540,7 +1537,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
                        _mm512_castsi128_si512( s4 ) );
 }
 
 #endif  // VBMI else
 #endif
 #endif  // AVX512
 
 ///////////////////////////
@@ -1983,9 +1980,9 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
 
 #endif
 
-#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
+#if defined(__AVX2__)
 
-//TODO Enable for AVX10_256 AVX10_512
+#if defined(VL256) && defined(VBMI)
 
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -2019,7 +2016,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                         _mm256_castsi128_si256( s4 ) );
 }
 
-#elif defined(__AVX2__)
+#else
 
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -2049,6 +2046,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                         _mm256_castsi128_si256( s4 ), 0x55 );
 }
 
 #endif
 
+#endif  // AVX2
+
 #endif  // SSE2
@@ -2375,9 +2374,7 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
 
 #endif // SSE2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__)
-
-//TODO Enable for AVX10_512
+#if defined(SIMD512)
 
 // broadcast to all lanes
 static inline void mm512_intrlv80_8x64( void *dst, const void *src )
@@ -2399,7 +2396,7 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
 
 // byte swap and broadcast to all lanes
 
-#if defined(__AVX512VBMI__)
+#if defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
@@ -2626,10 +2623,9 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
 
 #endif // SSE2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
-#if defined(__AVX512VBMI__)
-//TODO Enable for AVX10_512
+#if defined(VBMI)
 
 static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
 {
@@ -3532,9 +3528,7 @@ do { \
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-//TODO Enable for AVX10_512
+#if defined(SIMD512)
 
 /*
 #define mm512_intrlv_blend_128( hi, lo ) \
@@ -3559,7 +3553,7 @@ do { \
    dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \
 } while(0)
 
-#endif // AVX512
+#endif // SIMD512
 
 #undef ILEAVE_4x32
 #undef LOAD_SRCE_4x32
@@ -207,12 +207,12 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
 
 #endif
 
-// broadcast (replicate) lane l to all lanes
-#define v128_replane64( v, l ) \
+// Broadcast lane l to all lanes
+#define v128_duplane64( v, l ) \
    ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
                 : _mm_shuffle_epi32( v, 0xee )
 
-#define v128_replane32( v, l ) \
+#define v128_duplane32( v, l ) \
    ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
  : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
  : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
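Aside (not part of the commit): the duplane macros expand to _mm_shuffle_epi32, whose immediate is four 2-bit lane selectors; 0xaa repeats lane 2 in every position. A minimal check, assuming SSE2 only:

// Illustrative only: 0xaa = 0b10'10'10'10 selects lane 2 for all four outputs,
// so v128_duplane32( v, 2 ) broadcasts element 2 across the vector.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m128i v = _mm_set_epi32( 40, 30, 20, 10 );   // lanes 3,2,1,0
   __m128i b = _mm_shuffle_epi32( v, 0xaa );      // same effect as v128_duplane32( v, 2 )
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, b );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );   // 30 30 30 30
   return 0;
}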
@@ -347,8 +347,7 @@ static inline __m128i v128_neg1_fn()
 // Basic operations without equivalent SIMD intrinsic
 
 // Bitwise not (~v)
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 static inline __m128i v128_not( const __m128i v )
 {  return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -402,8 +401,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
 #define memcpy_128 v128_memcpy
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
+// ~v1 | v0
+#define v128_ornot( v1, v0 )   _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define v128_xor3( a, b, c )   _mm_ternarylogic_epi64( a, b, c, 0x96 )
@@ -434,6 +435,8 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 
 #else
 
+#define v128_ornot( v1, v0 )   _mm_or_si128( v1, v128_not( v0 ) )
+
 #define v128_xor3( a, b, c )   _mm_xor_si128( a, _mm_xor_si128( b, c ) )
 
 #define v128_and3( a, b, c )   _mm_and_si128( a, _mm_and_si128( b, c ) )
@@ -454,7 +457,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 
 #endif
 
-#define v128_ornot( a, b )  _mm_or_si128( a, v128_not( b ) )
 
 // Mask making
 // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
@@ -494,7 +496,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_rol32_sse2( v, c ) \
    _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 // AVX512 fastest for all rotations.
 #define v128_ror64    _mm_ror_epi64
@@ -609,13 +611,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 // deprecated
 #define mm128_rol_32    v128_rol32
 
+// ror( v1 ^ v0, n )
+#define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
+
 /* not used
 // x2 rotates elements in 2 individual vectors in a double buffered
 // optimization for SSE2, does nothing for AVX512 but is there for
 // transparency.
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 #define v128_2ror64( v1, v0, c ) \
    _mm_ror_epi64( v0, c ); \
@@ -917,10 +921,8 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 #define v128_block_bswap32    mm128_block_bswap_32
 #define v128_block_bswap64    mm128_block_bswap_64
 
 
-// alignr instruction for 32 & 64 bit elements is only available with AVX512
-// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
 
 #if defined(__SSSE3__)
 
 #define v128_alignr8    _mm_alignr_epi8
@@ -929,6 +931,9 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 
 #else
 
+#define v128_alignr8( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
+
 #define v128_alignr64( hi, lo, c ) \
    _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
 
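Aside (not part of the commit): Intel's alignr semantics, which the SSSE3 path maps to directly, can be modelled on bytes as follows for 0 <= c <= 16. This is an illustrative sketch only; alignr8_ref is a hypothetical helper name, not something from the repo.

// Scalar reference for _mm_alignr_epi8( hi, lo, c ): concatenate hi:lo,
// shift right by c bytes, keep the low 16 bytes.
#include <stdint.h>
#include <string.h>

static void alignr8_ref( uint8_t r[16], const uint8_t hi[16],
                         const uint8_t lo[16], int c )
{
   uint8_t cat[32];
   memcpy( cat,      lo, 16 );   // lo occupies the low half
   memcpy( cat + 16, hi, 16 );   // hi occupies the high half
   memcpy( r, cat + c, 16 );     // bytes c .. c+15 of the concatenation
}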
@@ -937,12 +942,15 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 
 #endif
 
+// blend using vector mask
 #if defined(__SSE4_1__)
 
+// Bytewise using sign bit of each byte element of mask
 #define v128_blendv    _mm_blendv_epi8
 
 #else
 
+// Bitwise
 #define v128_blendv( v1, v0, mask ) \
    v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
 
@@ -66,8 +66,7 @@ typedef union
 
 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 #define mm256_bcast128lo_64( i64 )   _mm256_maskz_set1_epi64( 0x55, i64 )
 #define mm256_bcast128hi_64( i64 )   _mm256_maskz_set1_epi64( 0xaa, i64 )
@@ -117,8 +116,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Basic operations without SIMD equivalent
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 static inline __m256i mm256_not( const __m256i v )
 {  return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -137,8 +135,10 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_add4_32( a, b, c, d ) \
    _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
+// ~v1 | v0
+#define mm256_ornot( v1, v0 )   _mm256_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define mm256_xor3( a, b, c )   _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -172,6 +172,8 @@ static inline __m256i mm256_not( const __m256i v )
 
 #else
 
+#define mm256_ornot( v1, v0 )   _mm256_or_si256( v1, mm256_not( v0 ) )
+
 #define mm256_xor3( a, b, c ) \
    _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
 
@@ -257,7 +259,7 @@ static inline __m256i mm256_not( const __m256i v )
    _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                     _mm256_srli_epi32( v, 32-(c) ) )
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -343,8 +345,7 @@ static inline __m256i mm256_not( const __m256i v )
 // optimization for AVX2, does nothing for AVX512 but is here for
 // transparency.
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 /*
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -470,7 +471,7 @@ static inline __m256i mm256_not( const __m256i v )
 
 /* Not used
 // Rotate 256 bit vector by one 32 bit element.
-#if defined(__AVX512VL__)
+#if defined(VL256)
 static inline __m256i mm256_shuflr_32( const __m256i v )
 {  return _mm256_alignr_epi32( v, v, 1 ); }
 static inline __m256i mm256_shufll_32( const __m256i v )
@@ -507,8 +508,8 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #define mm256_shuflr128_32(v)   _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_shufll128_32(v)   _mm256_shuffle_epi32( v, 0x93 )
 
-#define mm256_shuflr128_16(v)   _mm256_shuffle_epi16( v, 0x39 )
-#define mm256_shufll128_16(v)   _mm256_shuffle_epi16( v, 0x93 )
+#define mm256_shuflr128_16(v)   mm256_shuffle_16( v, 0x39 )
+#define mm256_shufll128_16(v)   mm256_shuffle_16( v, 0x93 )
 
 /* Not used
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
@@ -606,6 +607,22 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
    casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
 }
 
+#if defined(VL256)
+
+#define mm256_alignr64   _mm256_alignr_epi64
+
+#else
+
+#define mm256_alignr64( v1, v0, c ) \
+   ( ( (c) & 3 ) == 1 ) ? _mm256_blend_epi32( mm256_shuflr_64( v1 ), \
+                                              mm256_shuflr_64( v0 ), 0x3f ) \
+ : ( ( (c) & 3 ) == 2 ) ? _mm256_blend_epi32( mm256_rev_128( v1 ), \
+                                              mm256_rev_128( v0 ), 0x0f ) \
+ : ( ( (c) & 3 ) == 3 ) ? _mm256_blend_epi32( mm256_shufll_64( v1 ), \
+                                              mm256_shufll_64( v0 ), 0x03 ) \
+ : v0
+
+#endif
+
 #endif // __AVX2__
 
 #endif // SIMD_256_H__
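Aside (not part of the commit): the valignq semantics that mm256_alignr64 targets treat v1:v0 as eight 64-bit lanes with v0 in the low half, shift right by c lanes, and keep the low four; the blend-of-rotated-vectors fallback above reproduces that for c = 1..3. A scalar reference sketch (alignr64x4_ref is a hypothetical name, not from the repo):

// Scalar reference for the 256-bit alignr on 64-bit lanes.
#include <stdint.h>
#include <string.h>

static void alignr64x4_ref( uint64_t r[4], const uint64_t v1[4],
                            const uint64_t v0[4], int c )
{
   uint64_t cat[8];
   memcpy( cat,     v0, 4 * sizeof(uint64_t) );   // v0 in the low half
   memcpy( cat + 4, v1, 4 * sizeof(uint64_t) );   // v1 in the high half
   memcpy( r, cat + ( c & 3 ), 4 * sizeof(uint64_t) );
}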
@@ -14,7 +14,13 @@
 // vectors. It is therefore not technically required for any 512 bit vector
 // utilities defined below.
 
-#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// if avx10              // avx512 is always set
+//    if evex512:           yes
+// else if avx512 :         yes    // avx512 is set but not avx10
+// else :                   no     // avx512 not set or avx10.1 is set without evex512
+
+
+#if defined(SIMD512)
 
 // AVX512 intrinsics have a few changes from previous conventions.
 //
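Aside (not part of the commit): the SIMD512, VL256 and VBMI gates used throughout this diff are defined elsewhere in the tree. The sketch below is purely hypothetical, showing only how such gates could be derived from compiler feature macros; it ignores the AVX10/EVEX512 cases described in the comment above and is not the project's actual definition.

// Hypothetical feature gates; the real definitions also account for AVX10.
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1    // full 512-bit AVX512 baseline usable
  #define VL256   1    // AVX512 instruction forms usable on 256/128-bit vectors
#endif
#if defined(__AVX512VBMI__)
  #define VBMI    1    // byte-granularity permutes (vpermb etc.)
#endif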
@@ -180,6 +186,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // Ternary logic uses 8 bit truth table to define any 3 input logical
 // expression using any number or combinations of AND, OR, XOR, NOT.
 
+// ~v1 | v0
+#define mm512_ornot( v1, v0 )   _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define mm512_xor3( a, b, c )   _mm512_ternarylogic_epi64( a, b, c, 0x96 )
 
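Aside (not part of the commit): the 8-bit immediate is a truth table indexed per bit by (a<<2)|(b<<1)|c, which is how 0x96 yields a ^ b ^ c and 0xcf (with the second operand passed twice) yields ~a | b, and how immediate 1 with the same operand three times yields ~a. A scalar model and quick check:

// Scalar model of the 3-input truth-table select used by ternarylogic.
#include <stdint.h>
#include <stdio.h>

static uint64_t ternlog64( uint64_t a, uint64_t b, uint64_t c, uint8_t imm )
{
   uint64_t r = 0;
   for ( int i = 0; i < 64; i++ )
   {
      int idx = ( ( (a >> i) & 1 ) << 2 ) | ( ( (b >> i) & 1 ) << 1 )
              |   ( (c >> i) & 1 );
      r |= (uint64_t)( ( imm >> idx ) & 1 ) << i;
   }
   return r;
}

int main(void)
{
   uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL,
            c = 0x0f0f0f0f0f0f0f0fULL;
   printf( "%d\n", ternlog64( a, b, c, 0x96 ) == ( a ^ b ^ c ) );   // xor3
   printf( "%d\n", ternlog64( a, b, b, 0xcf ) == ( ~a | b ) );      // ornot
   printf( "%d\n", ternlog64( a, a, a, 0x01 ) == ~a );              // not
   return 0;
}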
@@ -4,22 +4,20 @@
 #if defined(__aarch64__) && defined(__ARM_NEON)
 
 // Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
-// Size matters!
+// Element size matters!
 //
 // Intel naming is generally used.
 //
-// documented instructions that aren't defined on RPi 4.
-// They seem to be all 3 op instructionsi.
+// Some advanced logical operations that require SHA3. Prior to GCC-13
+// they also require armv8.2
 //
-// veor3q                    ie xor3
-// vxarq_u64( v1, v0, n )    ror( xor( v1, v0 ), n )
-// vraxlq_u64( v1, v0 )      xor( rol( v1, 1 ), rol( v0, 1 ) )
-// vbcaxq( v2, v1, v0 )      xor( v2, and( v1, not(v0) ) )
-// vsraq_n( v1, v0, n )      add( v1, sr( v0, n ) )
+// veor3q( v2, v1, v0 )                 xor3        v2 ^ v1 ^ v0
+// vxarq_u64( v1, v0, n )               ror64xor    ( v1 ^ v0 ) >>> n
+// vbcaxq_u{64,32,16,8}( v2, v1, v0 )   xorandnot   v2 ^ ( v1 & ~v0 )
 //
-// Doesn't work on RPi but works on OPi:
-//
-// vornq( v1, v0 )           or( v1, not( v0 ) )
+// not used anywhere yet
+// vrax1q_u64( v1, v0 )                 v1 ^ ( v0 <<< 1 )
+// vsraq_n_u{64,32,16,8}( v1, v0, n )   v1 + ( v0 >> n )
 
 #define v128_t        uint32x4_t   // default,
 #define v128u64_t     uint64x2_t
@@ -87,15 +85,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // Not yet needed
 //#define v128_cmpeq1
 // Signed
-#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
-#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
-#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
-#define v128_cmpgt8(  v1, v0 )   vcgtq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
+#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
+#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
+#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
+#define v128_cmpgt8(  v1, v0 )   vcgtq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )
 
-#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
-#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
-#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
-#define v128_cmplt8(  v1, v0 )   vcltq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
+#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
+#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
+#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
+#define v128_cmplt8(  v1, v0 )   vcltq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )
 
 // Logical bit shift
 #define v128_sl64    vshlq_n_u64
@@ -109,33 +107,38 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sr8     vshrq_n_u8
 
 // Arithmetic shift.
-#define v128_sra64( v, c )   vshrq_n_s64( (int64x2_t)v, c )
-#define v128_sra32( v, c )   vshrq_n_s32( (int32x4_t)v, c )
-#define v128_sra16( v, c )   vshrq_n_s16( (int16x8_t)v, c )
+#define v128_sra64( v, c )   vshrq_n_s64( (int64x2_t)(v), c )
+#define v128_sra32( v, c )   vshrq_n_s32( (int32x4_t)(v), c )
+#define v128_sra16( v, c )   vshrq_n_s16( (int16x8_t)(v), c )
 
 // unary logic
 #define v128_not     vmvnq_u32
 
 // binary logic
 #define v128_or      vorrq_u32
 #define v128_and     vandq_u32
 #define v128_xor     veorq_u32
 
 // ~v1 & v0
-#define v128_andnot( v1, v0 )   vandq_u32( vmvnq_u32( v1 ), v0 )
+#define v128_andnot( v1, v0 )   vbicq_u32( v0, v1 )
 
 // ~( a ^ b ), same as (~a) ^ b
 #define v128_xnor( v1, v0 )     v128_not( v128_xor( v1, v0 ) )
 
-// ~v1 | v0, x86_64 convention, first arg is not'ed
-#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
+// ~v1 | v0, args reversed for consistency with x86_64
+#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
 
 // ternary logic
 
 // v2 ^ v1 ^ v0
-// veorq_u32 not defined
-//#define v128_xor3    veor3q_u32
-#define v128_xor3( v2, v1, v0 )   veorq_u32( v2, veorq_u32( v1, v0 ) )
+// This will compile with GCC-11 on armv8.2 and above. At this time there is no
+// known way to test arm minor version.
+#if defined(__ARM_FEATURE_SHA3)
+#define v128_xor3    veor3q_u32
+#else
+#define v128_xor3( v2, v1, v0 )   veorq_u32( v2, veorq_u32( v1, v0 ) )
+#endif
 
 // v2 & v1 & v0
 #define v128_and3( v2, v1, v0 )   v128_and( v2, v128_and( v1, v0 ) )
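Aside (not part of the commit): vbicq_u32(x, y) computes x & ~y and vornq_u32(x, y) computes x | ~y, so the reversed-argument wrappers above give ~v1 & v0 and ~v1 | v0 respectively. A small aarch64-only check:

// Illustrative only; compiles on aarch64 with NEON.
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
   uint32x4_t v1 = vdupq_n_u32( 0xf0f0f0f0 );
   uint32x4_t v0 = vdupq_n_u32( 0x0000ffff );
   // vbicq_u32( a, b ) = a & ~b  ->  ~v1 & v0 when called as ( v0, v1 )
   uint32x4_t an = vbicq_u32( v0, v1 );
   // vornq_u32( a, b ) = a | ~b  ->  ~v1 | v0 when called as ( v0, v1 )
   uint32x4_t on = vornq_u32( v0, v1 );
   printf( "%08x %08x\n", vgetq_lane_u32( an, 0 ), vgetq_lane_u32( on, 0 ) );
   // expected: 00000f0f 0f0fffff
   return 0;
}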
@@ -143,8 +146,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // v2 | v1 | v0
 #define v128_or3( v2, v1, v0 )    v128_or( v2, v128_or( v1, v0 ) )
 
-// a ^ ( ~b & c )
-#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
+// v2 ^ ( ~v1 & v0 )
+#if defined(__ARM_FEATURE_SHA3)
+#define v128_xorandnot( v2, v1, v0 )   vbcaxq_u32( v2, v0, v1 )
+#else
+#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
+#endif
 
 // a ^ ( b & c )
 #define v128_xorand( v2, v1, v0 )   v128_xor( v2, v128_and( v1, v0 ) )
@@ -158,12 +165,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // v2 | ( v1 & v0 )
 #define v128_orand( v2, v1, v0 )   v128_or( v2, v128_and( v1, v0 ) )
 
-// shift 2 concatenated vectors right.
+// shift 2 concatenated vectors right, args reversed for consistency with x86_64
 #define v128_alignr64( v1, v0, c )   vextq_u64( v0, v1, c )
 #define v128_alignr32( v1, v0, c )   vextq_u32( v0, v1, c )
 #define v128_alignr8(  v1, v0, c )   vextq_u8(  v0, v1, c )
 
-// Intetleave high or low half of 2 vectors.
+// Interleave high or low half of 2 vectors.
 #define v128_unpacklo64( v1, v0 )   vzip1q_u64( v1, v0 )
 #define v128_unpackhi64( v1, v0 )   vzip2q_u64( v1, v0 )
 #define v128_unpacklo32( v1, v0 )   vzip1q_u32( v1, v0 )
@@ -214,10 +221,10 @@ typedef union
 #define v128_bcast32(v)   vdupq_laneq_u32( v, 0 )
 #define v128_bcast16(v)   vdupq_laneq_u16( v, 0 )
 
-// Replicate (broadcast) lane l to all lanes
-#define v128_replane64( v, l )   vdupq_laneq_u64( v, l )
-#define v128_replane32( v, l )   vdupq_laneq_u32( v, l )
-#define v128_replane16( v, l )   vdupq_laneq_u16( v, l )
+// Broadcast lane l to all lanes
+#define v128_duplane64( v, l )   vdupq_laneq_u64( v, l )
+#define v128_duplane32( v, l )   vdupq_laneq_u32( v, l )
+#define v128_duplane16( v, l )   vdupq_laneq_u16( v, l )
 
 // pointer indexing
 #define casti_v128( p, i )   (((uint32x4_t*)(p))[i])
@@ -232,16 +239,6 @@ typedef union
 #define cast_v128u32( p )    (*((uint32x4_t*)(p)))
 #define castp_v128u32( p )   ((uint32x4_t*)(p))
 
-// use C cast, flexible source type
-#define u32_to_u64   vreinterpretq_u64_u32
-#define u64_to_u32   vreinterpretq_u32_u64
-
-#define u64_to_u8    vreinterpretq_u8_u64
-#define u8_to_u64    vreinterpretq_u64_u8
-
-#define u32_to_u8    vreinterpretq_u8_u32
-#define u8_to_u32    vreinterpretq_u32_u8
-
 #define v128_zero    v128_64( 0ull )
 
 #define v128_cmpeq_zero    vceqzq_u64
@@ -336,35 +333,56 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_movmask64
 
 // Bit rotation
 
 #define v128_ror64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
-   : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
+               : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
+                              ((uint64x2_t)(v)), c )
 
 #define v128_rol64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
-   : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
+               : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
+                              ((uint64x2_t)(v)), c )
 
 #define v128_ror32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
-   : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
+               : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
+                              ((uint32x4_t)(v)), c )
 
 #define v128_rol32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
-   : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
+               : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
+                              ((uint32x4_t)(v)), c )
 
 #define v128_ror16( v, c ) \
-   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
-   : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
+              : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
+                             ((uint16x8_t)(v)), c )
 
 #define v128_rol16( v, c ) \
    ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
-   : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+              : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
+                             ((uint16x8_t)(v)), c )
 
 #define v128_ror8( v, c ) \
-   vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
+   vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+               ((uint8x16_t)(v)), c )
 
 #define v128_rol8( v, c ) \
-   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
+   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+               ((uint8x16_t)(v)), c )
 
+// ror( v1 ^ v0, n )
+#if defined(__ARM_FEATURE_SHA3)
+
+#define v128_ror64xor( v1, v0, n )   vxarq_u64( v1, v0, n )
+
+#else
+
+#define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
+
+#endif
+
 #define v128_2ror64( v1, v0, c ) \
 { \
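Aside (not part of the commit): the rotate construction above works because VSRI preserves the top c bits of its first operand; shifting v left by 64-c parks the wrapped-around bits there and the insert then supplies v >> c below them. A scalar model of one 64-bit lane (ror64_model is a hypothetical name):

// Scalar model of one lane of v128_ror64( v, c ) for 0 < c < 64.
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64_model( uint64_t v, unsigned c )
{
   uint64_t hi = v << ( 64 - c );   // what vshlq_n_u64 produces
   uint64_t lo = v >> c;            // what vsriq_n_u64 shifts in
   return hi | lo;                  // the insert keeps hi's top c bits
}

int main(void)
{
   uint64_t v = 0x0123456789abcdefULL;
   printf( "%016llx\n", (unsigned long long)ror64_model( v, 8 ) );
   // expected: ef0123456789abcd
   return 0;
}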
@@ -416,7 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 */
 
 #define v128_shuffle8( v, vmask ) \
-   vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
+   vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
 
 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -465,7 +483,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
 #define v128_bswap32(v)    (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
 #define v128_bswap64(v)    (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
 #define v128_bswap128(v)   (uint32x4_t)v128_swap64( v128_bswap64(v) )
-#define v128_bswap256(p)   v128_bswap128( (p)[0], (p)[1] )
 
 // Usefull for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
@@ -534,16 +551,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
    casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
 }
 
-// Blendv
-#define v128_blendv( v1, v0, mask ) \
-   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
-
-/*
-// vbcaxq not defined
-#define v128_blendv( v1, v0, mask ) \
-   vbcaxq_u32( v128_and( mask, v1 ), v0, mask )
-*/
+// Bitwise blend using vector mask
+#define v128_blendv( v1, v0, mask )   vbslq_u32( mask, v1, v0 )
 
 #endif // __ARM_NEON
 
 #endif // SIMD_NEON_H__
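Aside (not part of the commit): vbslq_u32 is a bitwise select; each result bit comes from the second argument where the corresponding mask bit is 1 and from the third where it is 0. A small aarch64-only illustration:

// Illustrative only; compiles on aarch64 with NEON.
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
   uint32x4_t mask = vdupq_n_u32( 0xffff0000 );
   uint32x4_t a    = vdupq_n_u32( 0xaaaaaaaa );
   uint32x4_t b    = vdupq_n_u32( 0x55555555 );
   // (mask & a) | (~mask & b) for every bit position
   uint32x4_t r = vbslq_u32( mask, a, b );
   printf( "%08x\n", vgetq_lane_u32( r, 0 ) );   // expected: aaaa5555
   return 0;
}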