#if defined(__aarch64__) && defined(__ARM_NEON)

// Targeted functions using generic names make the portable versions obsolete.

#define v128_t uint32x4_t

// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )

// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8

#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8

// return low half
#define v128_mullo64 vmulq_u64
#define v128_mullo32 vmulq_u32
#define v128_mullo16 vmulq_u16

// widen not working, use placeholders
//#define v128_mul32 vmull_u32
//#define v128_mul16 vmull_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16

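// Note: arm_neon.h provides no 64 bit element multiply, so vmulq_u64 above is
// not a standard intrinsic. If a 64 bit low-half multiply is needed it can be
// composed from 32 bit pieces; the helper below is an illustrative sketch only
// and its name is hypothetical, not part of this API.
static inline uint64x2_t v128_mullo64_sketch( uint64x2_t a, uint64x2_t b )
{
   uint32x4_t a32 = vreinterpretq_u32_u64( a );
   uint32x4_t b32 = vreinterpretq_u32_u64( b );
   // cross terms a_lo*b_hi and a_hi*b_lo within each 64 bit lane
   uint32x4_t cross = vmulq_u32( a32, vrev64q_u32( b32 ) );
   // sum the cross terms and shift them into the high 32 bits of each lane
   uint64x2_t hi = vshlq_n_u64( vpaddlq_u32( cross ), 32 );
   // full 64 bit products of the low 32 bit halves
   uint64x2_t lo = vmull_u32( vmovn_u64( a ), vmovn_u64( b ) );
   return vaddq_u64( lo, hi );
}
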
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16

#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16

#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16

// bit shift & rotate
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16

#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16

#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16

// logical ops
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_not vmvnq_u32
#define v128_xor veorq_u32

#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3 veor3q_u32

// Note: vornq_u32( a, b ) computes a | ~b (OR-NOT), not a true NOR.
#define v128_nor vornq_u32

#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c ) )
#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) )

#define v128_alignr64 vextq_u64
#define v128_alignr32 vextq_u32
#define v128_alignr8 vextq_u8

// Note: for 64 bit elements vtrn1q/vtrn2q match SSE2 unpacklo/unpackhi, but
// for smaller elements they interleave even/odd indexed lanes rather than the
// low/high halves (vzip1q/vzip2q would match SSE2 semantics).
#define v128_unpacklo64 vtrn1q_u64
#define v128_unpackhi64 vtrn2q_u64

#define v128_unpacklo32 vtrn1q_u32
#define v128_unpackhi32 vtrn2q_u32

#define v128_unpacklo16 vtrn1q_u16
#define v128_unpackhi16 vtrn2q_u16

#define v128_unpacklo8 vtrn1q_u8
#define v128_unpackhi8 vtrn2q_u8

// AES
// consistent with Intel AES, break up for optimizing
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k ) vaeseq_u8( v, k )

#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )

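// Caveat: vaeseq_u8 XORs the round key before SubBytes/ShiftRows, while Intel
// AESENC XORs it after MixColumns, so the macros above only line up with the
// Intel versions when the caller compensates in its key handling. As an
// illustrative sketch only (hypothetical name, not part of this API), an
// exact Intel-style encrypt round can be written as:
#define v128_aesenc_intel_sketch( v, k ) \
   veorq_u8( k, vaesmcq_u8( vaeseq_u8( v, vdupq_n_u8( 0 ) ) ) )
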
// pointer indexing
#define casti_v128( p, i ) (((uint32x4_t*)(p))[i])
#define cast_v128( p ) (*((uint32x4_t*)(p)))

// Many NEON instructions are sized (typed by element width) when they don't
// need to be, zero for example, which may cause the compiler to complain when
// the sizes don't match. Use "-flax-vector-conversions".

#define u32_to_u64 vreinterpretq_u64_u32
#define u64_to_u32 vreinterpretq_u32_u64

#define u64_to_u8 vreinterpretq_u8_u64
#define u8_to_u64 vreinterpretq_u64_u8

#define u32_to_u8 vreinterpretq_u8_u32
#define u8_to_u32 vreinterpretq_u32_u8

// u16 casts, needed by v128_bswap16 below
#define u16_to_u8 vreinterpretq_u8_u16
#define u8_to_u16 vreinterpretq_u16_u8

#define v128_zero v128_64( 0ull )
//#define v128_zero_fn() v128_64( 0ull )
//#define v128_zero v128_zero_fn

// set1
#define v128_32 vmovq_n_u32
#define v128_64 vmovq_n_u64

#define v128_set64( u64_1, u64_0 ) \
   ( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64 v128_set64 // deprecated

#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
   (uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
               | ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32 v128_set32 // deprecated

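// Argument order matches Intel _mm_set_epi64x / _mm_set_epi32: the last
// argument lands in lane 0. For example, on little endian AArch64
// v128_set64( hi, lo ) yields lane0 = lo and lane1 = hi.
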
static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }

static inline void v128_memset( uint32x4_t *dst, const uint32x4_t *src,
                                const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = src[i]; }

static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; }

// select src & dst lanes
#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )

// move src u64 to lane 0, neon needs a source vector to write into
#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)

static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }

static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }

static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }

#define v128_add4_32( v3, v2, v1, v0 ) \
   vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )

// how to build a bitmask from vector elements?
#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32")
#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64")

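// One possible answer, shown only as an illustrative sketch (the name is
// hypothetical and the static asserts above are left as-is): test the sign
// bit of each lane, AND with per-lane bit weights, then add across lanes.
static inline unsigned int v128_movmask32_sketch( uint32x4_t v )
{
   const uint32x4_t bits = { 1, 2, 4, 8 };                    // lane bit weights
   uint32x4_t m = vtstq_u32( v, vdupq_n_u32( 0x80000000 ) );  // all-ones where the sign bit is set
   return vaddvq_u32( vandq_u32( m, bits ) );                 // sum the selected weights
}
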
// rotate by pairing a shift-left-insert with a shift-right-insert
static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }

static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }

static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }

static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }

static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }

static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }

// reverse endian byte order
#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) ) )
#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) ) )
#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) ) )
#define v128_bswap128(v) v128_swap64( v128_bswap64(v) )

#define v128_block_bswap32( dst, src ) \
   casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
   casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
   casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
   casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
   casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
   casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
   casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
   casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );

#define v128_block_bswap64( dst, src ) \
   dst[0] = v128_bswap64( src[0] ); \
   dst[1] = v128_bswap64( src[1] ); \
   dst[2] = v128_bswap64( src[2] ); \
   dst[3] = v128_bswap64( src[3] ); \
   dst[4] = v128_bswap64( src[4] ); \
   dst[5] = v128_bswap64( src[5] ); \
   dst[6] = v128_bswap64( src[6] ); \
   dst[7] = v128_bswap64( src[7] );

#define v128_rev32( v ) vrev64q_u32( v )

static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }

static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }

static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }

static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }

#define v128_swap64_32(v) v128_ror64( v, 32 )
#define v128_shuflr64_24(v) v128_ror64( v, 24 )
#define v128_shuflr64_16(v) v128_ror64( v, 16 )

#define v128_swap32_16(v) v128_ror32( v, 16 )
#define v128_shuflr32_8(v) v128_ror32( v, 8 )

// Not the same as SSE2, this uses vector mask, SSE2 uses imm8 mask.
#define v128_blend16( v1, v0, mask ) \
   v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )

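// The same select can also be written with the NEON bit-select intrinsic,
// e.g. vbslq_u32( mask, v1, v0 ), which computes (mask & v1) | (~mask & v0);
// noted here only as a possible alternative, not a change to the macro above.
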
#endif