// cpuminer-opt-gpu: simd-utils/simd-neon.h (v3.23.4)
#if defined(__aarch64__) && defined(__ARM_NEON)
// NEON-targeted functions behind generic names; separate portable versions
// are no longer needed.
#define v128_t uint32x4_t
// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// multiply, returning the low half of each product
// NB: NEON has no 64-bit integer multiply intrinsic (vmulq_u64 does not
// exist), so the 64-bit macros below are placeholders that will not compile
// if instantiated; a software fallback is sketched after this block.
#define v128_mullo64 vmulq_u64
#define v128_mullo32 vmulq_u32
#define v128_mullo16 vmulq_u16
// widening multiply not working, use non-widening placeholders
//#define v128_mul32 vmull_u32
//#define v128_mul16 vmull_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
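// Sketch of a possible software fallback for the missing 64-bit multiply
// (hypothetical helper, not part of the original API): build the low 64 bits
// of each product from 32x32 partial products.
static inline uint64x2_t v128_mullo64_sw( uint64x2_t a, uint64x2_t b )
{
   uint32x4_t a32 = vreinterpretq_u32_u64( a );
   uint32x4_t b32 = vreinterpretq_u32_u64( b );
   // cross terms a_lo*b_hi and a_hi*b_lo for each 64-bit lane
   uint32x4_t cross = vmulq_u32( a32, vrev64q_u32( b32 ) );
   // sum the cross terms and move them into the high 32 bits
   uint64x2_t hi = vshlq_n_u64( vpaddlq_u32( cross ), 32 );
   // full 64-bit a_lo*b_lo products
   uint64x2_t lo = vmull_u32( vmovn_u64( a ), vmovn_u64( b ) );
   return vaddq_u64( hi, lo );
}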
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16
// bit shift & rotate
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
// logical ops
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_not vmvnq_u32
#define v128_xor veorq_u32
#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3 veor3q_u32
// vornq is OR-NOT, not NOR; keep it under its own name and build NOR explicitly
#define v128_ornot( v1, v0 ) vornq_u32( v1, v0 )
#define v128_nor( v1, v0 ) vmvnq_u32( vorrq_u32( v1, v0 ) )
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c ))
#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) )
#define v128_alignr64 vextq_u64
#define v128_alignr32 vextq_u32
#define v128_alignr8 vextq_u8
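// Illustrative helper (hypothetical, not part of the original API) showing
// vextq argument order: elements are taken from the first operand starting
// at the given index, with the remainder filled from the second operand.
static inline uint32x4_t v128_alignr32_example( void )
{
   uint32x4_t lo = { 0, 1, 2, 3 };
   uint32x4_t hi = { 4, 5, 6, 7 };
   // yields { 1, 2, 3, 4 }: note the operand order is reversed relative to
   // SSE _mm_alignr_epi8( hi, lo, n )
   return v128_alignr32( lo, hi, 1 );
}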
// interleave low/high halves, matching SSE2 unpacklo/unpackhi semantics
// (vtrn1q/vtrn2q only coincide with this for 64-bit elements)
#define v128_unpacklo64 vzip1q_u64
#define v128_unpackhi64 vzip2q_u64
#define v128_unpacklo32 vzip1q_u32
#define v128_unpackhi32 vzip2q_u32
#define v128_unpacklo16 vzip1q_u16
#define v128_unpackhi16 vzip2q_u16
#define v128_unpacklo8 vzip1q_u8
#define v128_unpackhi8 vzip2q_u8
// AES
// Consistent with Intel AES instruction semantics: ARM AESE/AESD add their
// key operand before SubBytes/ShiftRows, so a zero key is passed and the
// real round key is XORed afterwards.  Broken up this way for optimizing.
#define v128_aesenc( v, k ) \
   v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
#define v128_aesenclast( v, k ) \
   v128_xor( k, vaeseq_u8( v, v128_zero ) )
#define v128_aesdec( v, k ) \
   v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
#define v128_aesdeclast( v, k ) \
   v128_xor( k, vaesdq_u8( v, v128_zero ) )
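// Usage sketch (hypothetical helper, not part of the original API): the
// natural ARM flow for a full AES-128 encryption.  Because AESE adds its key
// operand before SubBytes/ShiftRows, rk[0..9] feed AESE directly and rk[10]
// is XORed at the end; the Intel-style wrappers above emulate the opposite
// ordering by passing a zero key to AESE and XORing the round key afterwards.
static inline uint8x16_t v128_aes128_encrypt_example( uint8x16_t b,
                                                      const uint8x16_t rk[11] )
{
   for ( int r = 0; r < 9; r++ )
      b = vaesmcq_u8( vaeseq_u8( b, rk[r] ) );   // AddRoundKey, SubBytes,
                                                 // ShiftRows, MixColumns
   b = vaeseq_u8( b, rk[9] );                    // final round, no MixColumns
   return veorq_u8( b, rk[10] );                 // final AddRoundKey
}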
// pointer indexing
#define casti_v128( p, i ) (((uint32x4_t*)(p))[i])
#define cast_v128( p ) (*((uint32x4_t*)(p)))
// Many NEON intrinsics are strictly typed even when the operation doesn't
// care about the element size (zero, for example), which can cause the
// compiler to complain when vector types don't match.  Compile with
// "-flax-vector-conversions" or reinterpret explicitly:
#define u32_to_u64 vreinterpretq_u64_u32
#define u64_to_u32 vreinterpretq_u32_u64
#define u64_to_u8 vreinterpretq_u8_u64
#define u8_to_u64 vreinterpretq_u64_u8
#define u32_to_u8 vreinterpretq_u8_u32
#define u8_to_u32 vreinterpretq_u32_u8
// needed by v128_bswap16 below
#define u16_to_u8 vreinterpretq_u8_u16
#define u8_to_u16 vreinterpretq_u16_u8
#define v128_zero v128_64( 0ull )
//#define v128_zero_fn() v128_64( 0ull )
//#define v128_zero v128_zero_fn
// set1
#define v128_32 vmovq_n_u32
#define v128_64 vmovq_n_u64
#define v128_set64( u64_1, u64_0 ) \
( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64 v128_set64 // deprecated
#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
(uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
| ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32 v128_set32 // deprecated
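// Lane-order note (example helper is hypothetical): on little-endian AArch64
// the low 64 bits of the 128-bit integer land in lane 0, so the argument
// order of v128_set32/v128_set64 matches _mm_set_epi32/_mm_set_epi64x.
static inline uint32_t v128_set32_example( void )
{
   uint32x4_t v = v128_set32( 3, 2, 1, 0 );
   return vgetq_lane_u32( v, 3 );   // returns 3, the first argument
}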
static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }
// fill n vectors with the value pointed to by src
static inline void v128_memset( uint32x4_t *dst, const uint32x4_t *src,
const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = *src; }
static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
// select src & dst lanes
#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )
// move src u64 to lane 0, neon needs a source vector to write into
#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)
static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }
static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }
static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }
#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
// NEON has no direct equivalent of SSE2 movemask; fail loudly at compile
// time if these are referenced.  A possible emulation is sketched below.
#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32")
#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64")
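// Possible emulation (hypothetical helper, not part of the original API):
// gather the sign bit of each 32-bit lane into a 4-bit scalar, similar to
// SSE2 movemask.  Negative per-lane shift counts shift right.
static inline unsigned int v128_movmask32_sw( uint32x4_t v )
{
   const int32x4_t shifts = { -31, -30, -29, -28 };
   uint32x4_t bits = vshlq_u32( vandq_u32( v, vdupq_n_u32( 0x80000000 ) ),
                                shifts );
   return vaddvq_u32( bits );   // lane i contributes bit i
}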
static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }
static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }
static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }
static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }
static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }
static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }
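// How the rotates above work: VSLI shifts its second operand left and keeps
// the untouched low bits of the first, VSRI shifts right and keeps the
// untouched high bits, so the two together merge (v << (bits-c)) with
// (v >> c).  Typical usage sketch (hypothetical helper, constants are
// SHA-256's small sigma0):
static inline uint32x4_t v128_sigma0_example( uint32x4_t x )
{
   return v128_xor3( v128_ror32( x, 7 ), v128_ror32( x, 18 ),
                     v128_sr32( x, 3 ) );
}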
// reverse endian byte order
#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) ))
#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) ))
#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) ))
#define v128_bswap128(v) v128_swap64( v128_bswap64(v) )
#define v128_block_bswap32( dst, src ) \
casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );
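// Usage sketch (hypothetical buffer): byte-swap a 128-byte block in place,
// e.g. when converting big-endian work data to host order.
static inline void v128_block_bswap32_example( uint32_t data[32] )
{
   v128_block_bswap32( data, data );
}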
#define v128_block_bswap64( dst, src ) \
dst[0] = v128_bswap64( src[0] ); \
dst[1] = v128_bswap64( src[1] ); \
dst[2] = v128_bswap64( src[2] ); \
dst[3] = v128_bswap64( src[3] ); \
dst[4] = v128_bswap64( src[4] ); \
dst[5] = v128_bswap64( src[5] ); \
dst[6] = v128_bswap64( src[6] ); \
dst[7] = v128_bswap64( src[7] );
// reverse the order of the 32-bit elements within each 64-bit lane
#define v128_rev32( v ) vrev64q_u32( v )
// swap the 64-bit halves of the vector
static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }
// rotate the 32-bit elements by two lanes (same effect as v128_swap64)
static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }
// rotate the 32-bit elements one lane toward lane 0
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }
// rotate the 32-bit elements one lane away from lane 0
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
#define v128_swap64_32(v) v128_ror64( v, 32 )
#define v128_shuflr64_24(v) v128_ror64( v, 24 )
#define v128_shuflr64_16(v) v128_ror64( v, 16 )
#define v128_swap32_16(v) v128_ror32( v, 16 )
#define v128_shuflr32_8(v) v128_ror32( v, 8 )
// Not the same as SSE2: this takes a full per-bit vector mask, whereas the
// SSE2/SSE4 blend takes an imm8 lane mask.
#define v128_blend16( v1, v0, mask ) \
v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )
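// Possible single-instruction alternative (hypothetical helper): NEON BSL
// performs the same bitwise select, taking bits from v1 where the mask bit
// is set and from v0 where it is clear.
static inline uint16x8_t v128_blend16_bsl( uint16x8_t mask, uint16x8_t v1,
                                           uint16x8_t v0 )
{
   return vbslq_u16( mask, v1, v0 );
}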
#endif   // defined(__aarch64__) && defined(__ARM_NEON)