mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00

v23.5

simd-utils/intrlv.h (1910 lines changed): diff suppressed because it is too large.
@@ -34,58 +34,85 @@
//
///////////////////////////////////////////////////////////////////////////////

// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use the new syntax or protect SSE2-only
// code segments.
// Other vector sizes continue with the old syntax for now.
// Definitions here will gradually be converted to the new syntax.
// For consistency the larger vector utilities should do the same.


// direct translation of native intrinsics

#define v128_t __m128i
// Needed for ARM
#define v128u64_t v128_t
#define v128u32_t v128_t
#define v128u16_t v128_t
#define v128u8_t v128_t

#define v128_load _mm_load_si128
#define v128_store _mm_store_si128

// Needed for ARM; doesn't do anything special on x86_64.
#define v128_load1_64(p) _mm_set1_epi64x( *(uint64_t*)(p) )
#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) )

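// Illustrative usage sketch (not part of this header): the v128_* names above
// compile unchanged against either this SSE2 backend or the NEON backend in
// simd-neon.h, assuming that backend defines the same names. Hypothetical
// helper; 16-byte aligned pointers assumed.
/*
static inline void xor_block_128( void *dst, const void *a, const void *b )
{
   v128_t va = v128_load( (const v128_t*)a );
   v128_t vb = v128_load( (const v128_t*)b );
   v128_store( (v128_t*)dst, v128_xor( va, vb ) );
}
*/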
// arithmetic
|
||||
#define v128_add64 _mm_add_epi64
|
||||
#define v128_add32 _mm_add_epi32
|
||||
#define v128_add16 _mm_add_epi16
|
||||
#define v128_add8 _mm_add_epi8
|
||||
#define v128_add4_64 mm128_add4_64
|
||||
#define v128_add4_32 mm128_add4_32
|
||||
|
||||
#define v128_sub64 _mm_sub_epi64
|
||||
#define v128_sub32 _mm_sub_epi32
|
||||
#define v128_sub16 _mm_sub_epi16
|
||||
#define v128_sub8 _mm_sub_epi8
|
||||
|
||||
// widen
|
||||
#define v128_mul64 _mm_mul_epu64
|
||||
#define v128_mul32 _mm_mul_epu32
|
||||
#define v128_mul16 _mm_mul_epu16
|
||||
|
||||
// save low half
|
||||
#define v128_mullo32 _mm_mullo_epi32
|
||||
#define v128_mullo16 _mm_mullo_epi16
|
||||
#define v128_mul64 _mm_mullo_epi64
|
||||
#define v128_mul32 _mm_mullo_epi32
|
||||
#define v128_mul16 _mm_mullo_epi16
|
||||
|
||||
// widen
|
||||
#define v128_mulw32 _mm_mul_epu32
|
||||
#define v128_mulw16 _mm_mul_epu16
|
||||
|
||||
// compare
|
||||
#define v128_cmpeq64 _mm_cmpeq_epi64
|
||||
#define v128_cmpeq32 _mm_cmpeq_epi32
|
||||
#define v128_cmpeq16 _mm_cmpeq_epi16
|
||||
#define v128_cmpeq8 _mm_cmpeq_epi8
|
||||
|
||||
#define v128_cmpgt64 _mm_cmpgt_epi64
|
||||
#define v128_cmpgt32 _mm_cmpgt_epi32
|
||||
#define v128_cmpgt16 _mm_cmpgt_epi16
|
||||
#define v128_cmpgt8 _mm_cmpgt_epi8
|
||||
|
||||
#define v128_cmplt64 _mm_cmplt_epi64
|
||||
#define v128_cmplt32 _mm_cmplt_epi32
|
||||
#define v128_cmplt16 _mm_cmplt_epi16
|
||||
#define v128_cmplt8 _mm_cmplt_epi8
|
||||
|
||||
// bit shift
|
||||
#define v128_sl64 _mm_slli_epi64
|
||||
#define v128_sl32 _mm_slli_epi32
|
||||
#define v128_sl16 _mm_slli_epi16
|
||||
#define v128_sl8 _mm_slli_epi8
|
||||
|
||||
#define v128_sr64 _mm_srli_epi64
|
||||
#define v128_sr32 _mm_srli_epi32
|
||||
#define v128_sr16 _mm_srli_epi16
|
||||
#define v128_sr8 _mm_srli_epi8
|
||||
|
||||
#define v128_sra64 _mm_srai_epi64
|
||||
#define v128_sra32 _mm_srai_epi32
|
||||
#define v128_sra16 _mm_srai_epi16
|
||||
#define v128_sra8 _mm_srai_epi8
|
||||
|
||||
// logic
|
||||
#define v128_or _mm_or_si128
|
||||
@@ -93,45 +120,48 @@
|
||||
#define v128_xor _mm_xor_si128
|
||||
#define v128_xorq _mm_xor_si128
|
||||
#define v128_andnot _mm_andnot_si128
|
||||
#define v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
|
||||
#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
|
||||
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
|
||||
#define v128_ornot( a, b ) mm128_or( a, mm128_not( b ) )
|
||||
|
||||
// ternary
|
||||
#define v128_xorandnot( v2, v1, v0 ) \
|
||||
_mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
|
||||
#define v128_xor3( v2, v1, v0 ) \
|
||||
_mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
|
||||
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
|
||||
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
|
||||
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
|
||||
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
|
||||
#define v128_nor mm128_nor
|
||||
|
||||
// shift 2 concatenated vectors right
|
||||
#define v128_alignr64 mm128_alignr_64
|
||||
#define v128_alignr32 mm128_alignr_32
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define v128_alignr8 _mm_alignr_epi8
|
||||
|
||||
#endif
|
||||
|
||||
// NEON version uses vector mask
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
#define v128_blend16 _mm_blend_epi16
|
||||
|
||||
#define v128_alignr8 _mm_alignr_epi8
|
||||
#endif
|
||||
|
||||
// unpack
|
||||
#define v128_unpacklo64 _mm_unpacklo_epi64
|
||||
#define v128_unpackhi64 _mm_unpackhi_epi64
|
||||
|
||||
#define v128_unpacklo32 _mm_unpacklo_epi32
|
||||
#define v128_unpackhi32 _mm_unpackhi_epi32
|
||||
|
||||
#define v128_unpacklo16 _mm_unpacklo_epi16
|
||||
#define v128_unpackhi16 _mm_unpackhi_epi16
|
||||
|
||||
#define v128_unpacklo8 _mm_unpacklo_epi8
|
||||
#define v128_unpackhi8 _mm_unpackhi_epi8
|
||||
|
||||
// New shorter agnostic name
|
||||
#define v128_ziplo64 _mm_unpacklo_epi64
|
||||
#define v128_ziphi64 _mm_unpackhi_epi64
|
||||
#define v128_ziplo32 _mm_unpacklo_epi32
|
||||
#define v128_ziphi32 _mm_unpackhi_epi32
|
||||
#define v128_ziplo16 _mm_unpacklo_epi16
|
||||
#define v128_ziphi16 _mm_unpackhi_epi16
|
||||
#define v128_ziplo8 _mm_unpacklo_epi8
|
||||
#define v128_ziphi8 _mm_unpackhi_epi8
|
||||
|
||||
// AES
|
||||
#define v128_aesenc _mm_aesenc_si128
|
||||
#define v128_aesenclast _mm_aesenclast_si128
|
||||
@@ -144,24 +174,26 @@ typedef union
|
||||
__m128i m128;
|
||||
uint32_t u32[4];
|
||||
} __attribute__ ((aligned (16))) m128_ovly;
|
||||
#define v128_ovly m128_ovly
|
||||
|
||||
|
||||
#define mm128_64(i64) _mm_set1_epi64x(i64)
|
||||
#define mm128_32(i32) _mm_set1_epi32(i32)
|
||||
#define v128_32 mm128_32
|
||||
#define v128_64 mm128_64
|
||||
// use for immediate constants, use load1 for mem.
|
||||
#define v128_64 _mm_set1_epi64x
|
||||
#define v128_32 _mm_set1_epi32
|
||||
#define v128_16 _mm_set1_epi16
|
||||
#define v128_8 _mm_set1_epi8
|
||||
|
||||
#define v128_set64 _mm_set_epi64x
|
||||
#define v128_set_64 v128_set64 // deprecated
|
||||
#define v128_set32 _mm_set_epi32
|
||||
#define v128_set_32 v128_set32 // deprecated
|
||||
|
||||
#define v128_set16 _mm_set_epi16
|
||||
#define v128_set8 _mm_set_epi8
|
||||
|
||||
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
|
||||
// that make these functions either unnecessary or inefficient.
|
||||
// In cases where an explicit move between GP & SIMD registers is still
|
||||
// necessary the cvt, set, or set1 intrinsics can be used allowing the
|
||||
// compiler to exploit new features to produce optimum code.
|
||||
// Currently only used internally and by Luffa.
|
||||
|
||||
static inline __m128i mm128_mov64_128( const uint64_t n )
|
||||
{
|
||||
__m128i a;
|
||||
@@ -172,7 +204,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
#define v128_mov64( u64 ) mm128_mov64_128( u64 )
|
||||
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
|
||||
|
||||
|
||||
static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
@@ -192,14 +224,28 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
|
||||
|
||||
// Pseudo constants
|
||||
#define v128_zero _mm_setzero_si128()
|
||||
#define m128_zero v128_zero
|
||||
#define v128_zero _mm_setzero_si128()
|
||||
#define m128_zero v128_zero
|
||||
|
||||
#define m128_one_128 mm128_mov64_128( 1 )
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
// Bitwise AND, return 1 if result is all bits clear.
|
||||
#define v128_and_eq0 _mm_testz_si128
|
||||
|
||||
static inline int v128_cmpeq0( v128_t v )
|
||||
{ return v128_and_eq0( v, v ); }
|
||||
|
||||
#endif
|
||||
|
||||
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1 _mm_test_all_ones
|
||||
|
||||
#define v128_one mm128_mov64_128( 1 )
|
||||
#define m128_one_128 v128_one
|
||||
|
||||
// ASM avoids the need to initialize return variable to avoid compiler warning.
|
||||
// Macro abstracts function parentheses to look like an identifier.
|
||||
static inline __m128i mm128_neg1_fn()
|
||||
static inline __m128i v128_neg1_fn()
|
||||
{
|
||||
__m128i a;
|
||||
#if defined(__AVX__)
|
||||
@@ -209,9 +255,54 @@ static inline __m128i mm128_neg1_fn()
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
#define m128_neg1 mm128_neg1_fn()
|
||||
#define m128_neg1_fn v128_neg1_fn
|
||||
#define v128_neg1 v128_neg1_fn()
|
||||
#define m128_neg1 v128_neg1
|
||||
|
||||
//
|
||||
// Vector pointer cast
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type
|
||||
#define castp_m128i(p) ((__m128i*)(p))
|
||||
#define castp_v128 castp_m128i
|
||||
#define castp_v128u64 castp_v128
|
||||
#define castp_v128u32 castp_v128
|
||||
#define castp_v128u16 castp_v128
|
||||
#define castp_v128u8 castp_v128
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m128i(p) (*((__m128i*)(p)))
|
||||
#define cast_v128 cast_m128i
|
||||
#define cast_v128u64 cast_v128
|
||||
#define cast_v128u32 cast_v128
|
||||
#define cast_v128u16 cast_v128
|
||||
#define cast_v128u8 cast_v128
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
|
||||
#define casti_v128 casti_m128i
|
||||
#define casti_v128u64 casti_v128
|
||||
#define casti_v128u32 casti_v128
|
||||
#define casti_v128u16 casti_v128
|
||||
#define casti_v128u8 casti_v128
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
|
||||
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
|
||||
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
|
||||
#define v128_get8( v, l ) _mm_extract_epi8( v, l )
|
||||
|
||||
#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
|
||||
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
|
||||
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
|
||||
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
//
|
||||
@@ -238,32 +329,25 @@ static inline __m128i mm128_neg1_fn()
|
||||
// c[7:6] source element selector
|
||||
|
||||
// Convert type and abbreviate name: eXtract Insert Mask = XIM
|
||||
#define mm128_xim_32( v1, v2, c ) \
|
||||
#define mm128_xim_32( v1, v0, c ) \
|
||||
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) )
|
||||
|
||||
/* Another way to do it with individual arguments.
|
||||
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
|
||||
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), \
|
||||
(mask) | ((i1)<<4) | ((i2)<<6) ) )
|
||||
*/
|
||||
_mm_castsi128_ps( v0 ), c ) )
|
||||
|
||||
// Examples of simple operations using xim:
|
||||
/*
|
||||
// Copy i32 to element c of dest and copy remaining elements from v.
|
||||
#define v128_put32( v, i32, c ) \
|
||||
mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
|
||||
*/
|
||||
|
||||
// Copy i to element c of dest and copy remaining elements from v.
|
||||
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
|
||||
const int c )
|
||||
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
|
||||
|
||||
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
|
||||
static inline __m128i mm128_mask_32( const __m128i v, const int m )
|
||||
{ return mm128_xim_32( v, v, m ); }
|
||||
|
||||
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
|
||||
#define mm128_mov32_32( v1, i1, v2, i2 ) \
|
||||
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
|
||||
#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls )
|
||||
#define v128_movlane32( v1, l1, v0, l0 ) \
|
||||
mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
|
||||
|
||||
#endif // SSE4_1
|
||||
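// Worked example (illustrative, not part of the header): with SSE4.1 enabled,
// v128_movlane32( v1, 3, v0, 0 ) expands to mm128_xim_32( v1, v0, (3<<4)|(0<<6) ),
// i.e. _mm_insert_ps with source element 0 (bits 7:6) and destination element 3
// (bits 5:4), copying lane 0 of v0 into lane 3 of v1 and leaving the other
// lanes of v1 unchanged.
/*
   v128_t v1 = v128_set32( 40, 30, 20, 10 );    // lanes 3..0 = 40, 30, 20, 10
   v128_t v0 = v128_set32(  4,  3,  2,  1 );    // lane 0 = 1
   v128_t r  = v128_movlane32( v1, 3, v0, 0 );  // lanes 3..0 =  1, 30, 20, 10
*/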
|
||||
@@ -282,8 +366,7 @@ static inline __m128i mm128_not( const __m128i v )
|
||||
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
|
||||
|
||||
#endif
|
||||
#define v128_not mm128_not
|
||||
|
||||
#define v128_not mm128_not
|
||||
|
||||
static inline __m128i mm128_negate_64( __m128i v )
|
||||
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
|
||||
@@ -315,30 +398,6 @@ static inline __m128i mm128_negate_16( __m128i v )
|
||||
#define mm128_xor4( a, b, c, d ) \
|
||||
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
|
||||
|
||||
|
||||
//
|
||||
// Vector pointer cast
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type
|
||||
#define castp_m128i(p) ((__m128i*)(p))
|
||||
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m128i(p) (*((__m128i*)(p)))
|
||||
#define cast_v128 cast_m128i
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
|
||||
#define casti_v128 casti_m128i
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
|
||||
|
||||
|
||||
// Memory functions
|
||||
// Mostly for convenience, avoids calculating bytes.
|
||||
// Assumes data is aligned and integral.
|
||||
@@ -424,6 +483,83 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
//
|
||||
// Bit rotations
|
||||
|
||||
// Neon has fast xor-ror, useful for big blake, if it actually works.
|
||||
#define v128_xror64( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
|
||||
|
||||
|
||||
// Slow bit rotation, used as last resort
|
||||
#define mm128_ror_64_sse2( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_rol_64_sse2( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_ror_32_sse2( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm128_rol_32_sse2( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
#define mm128_rol_64 _mm_rol_epi64
|
||||
#define mm128_ror_32 _mm_ror_epi32
|
||||
#define mm128_rol_32 _mm_rol_epi32
|
||||
|
||||
// optimized byte wise rotation
|
||||
#elif defined(__SSSE3__)
|
||||
|
||||
#define mm128_ror_64( v, c ) \
|
||||
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) \
|
||||
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) \
|
||||
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) \
|
||||
: mm128_ror_64_sse2( v, c )
|
||||
|
||||
#define mm128_rol_64( v, c ) \
|
||||
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) \
|
||||
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) \
|
||||
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) \
|
||||
: mm128_rol_64_sse2( v, c )
|
||||
|
||||
#define mm128_ror_32( v, c ) \
|
||||
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
|
||||
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) \
|
||||
: mm128_ror_32_sse2( v, c )
|
||||
|
||||
#define mm128_rol_32( v, c ) \
|
||||
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
|
||||
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) \
|
||||
: mm128_rol_32_sse2( v, c )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_ror_64 mm128_ror_64_sse2
|
||||
#define mm128_rol_64 mm128_rol_64_sse2
|
||||
#define mm128_ror_32 mm128_ror_32_sse2
|
||||
#define mm128_rol_32 mm128_rol_32_sse2
|
||||
|
||||
#endif
|
||||
|
||||
// Architecturally agnostic naming
|
||||
#define v128_ror64 mm128_ror_64
|
||||
#define v128_rol64 mm128_rol_64
|
||||
#define v128_ror32 mm128_ror_32
|
||||
#define v128_rol32 mm128_rol_32
|
||||
|
||||
|
||||
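// Illustrative example (not part of the header): v128_ror64( v, 8 ) rotates each
// 64-bit lane right by one byte; with SSSE3 the (c) == 8 case above resolves to
// a single _mm_shuffle_epi8, otherwise to the shift/or fallback.
/*
   v128_t v = v128_64( 0x1122334455667788ULL );
   v128_t r = v128_ror64( v, 8 );   // each lane becomes 0x8811223344556677
*/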
// x2 rotates elements in 2 individual vectors in a double buffered
|
||||
// optimization for SSE2, does nothing for AVX512 but is there for
|
||||
// transparency.
|
||||
@@ -431,13 +567,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
#define mm128_rol_64 _mm_rol_epi64
|
||||
#define mm128_ror_32 _mm_ror_epi32
|
||||
#define mm128_rol_32 _mm_rol_epi32
|
||||
#define mm128_ror_16 _mm_ror_epi16
|
||||
#define mm128_rol_16 _mm_rol_epi16
|
||||
|
||||
#define mm128_rorx2_64( v1, v0, c ) \
|
||||
_mm_ror_epi64( v0, c ); \
|
||||
_mm_ror_epi64( v1, c )
|
||||
@@ -456,24 +585,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_ror_64( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_rol_64( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_ror_32( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm128_rol_32( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm128_ror_16( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
|
||||
|
||||
#define mm128_rol_16( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
|
||||
|
||||
#define mm128_rorx2_64( v1, v0, c ) \
|
||||
{ \
|
||||
__m128i t0 = _mm_srli_epi64( v0, c ); \
|
||||
@@ -516,17 +627,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
#endif // AVX512 else SSE2
|
||||
|
||||
#define v128_ror64 mm128_ror_64
|
||||
#define v128_rol64 mm128_rol_64
|
||||
#define v128_2ror64 mm128_rorx2_64
|
||||
#define v128_2rol64 mm128_rolx2_64
|
||||
#define v128_2ror32 mm128_rorx2_32
|
||||
#define v128_2rol32 mm128_rolx2_32
|
||||
|
||||
#define v128_ror32 mm128_ror_32
|
||||
#define v128_rol32 mm128_rol_32
|
||||
|
||||
#define v128_ror16 mm128_ror_16
|
||||
#define v128_rol16 mm128_rol_16
|
||||
|
||||
// Cross lane shuffles
|
||||
//
|
||||
|
||||
#define v128_shuffle32 _mm_shuffle_epi32
|
||||
|
||||
// shuffle using vector mask, for compatibility with NEON
|
||||
#define v128_shufflev32( v, vmask ) \
|
||||
v128_shuffle32( v, mm128_movmask_32( vmask ) )
|
||||
|
||||
#define v128_shuffle8 _mm_shuffle_epi8
|
||||
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from v1, and the high half from v2.
|
||||
#define mm128_shuffle2_64( v1, v2, c ) \
|
||||
@@ -540,19 +656,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
// Rotate vector elements across all lanes
|
||||
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
#define v128_swap64 mm128_swap_64
|
||||
|
||||
#define v128_swap64 mm128_swap_64
|
||||
#define mm128_shuflr_64 mm128_swap_64
|
||||
#define mm128_shufll_64 mm128_swap_64
|
||||
|
||||
// Don't use as an alias for byte sized bit rotation
|
||||
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define v128_shuflr32 mm128_shuflr_32
|
||||
#define v128_shuflr32 mm128_shuflr_32
|
||||
|
||||
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
#define v128_shufll32 mm128_shufll_32
|
||||
#define v128_shufll32 mm128_shufll_32
|
||||
|
||||
#define v128_swap64_32( v ) v128_ror64( v, 32 )
|
||||
|
||||
#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
|
||||
#define v128_rev32( v ) mm128_rev_32( v )
|
||||
#define v128_rev32 mm128_rev_32
|
||||
|
||||
/* Not used
|
||||
#if defined(__SSSE3__)
|
||||
@@ -564,65 +682,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
#endif
|
||||
*/
|
||||
|
||||
// Rotate 64 bit lanes
|
||||
|
||||
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||
#define v128_swap64_32 mm128_swap64_32
|
||||
|
||||
#define mm128_shuflr64_32 mm128_swap64_32
|
||||
#define mm128_shufll64_32 mm128_swap64_32
|
||||
|
||||
//TODO Enable for AVX10_256
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
|
||||
#elif defined(__SSSE3__)
|
||||
#define mm128_shuflr64_24( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
|
||||
#else
|
||||
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
|
||||
#endif
|
||||
#define v128_shuflr64_24 mm128_shuflr64_24
|
||||
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
|
||||
#elif defined(__SSSE3__)
|
||||
#define mm128_shuflr64_16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
|
||||
#else
|
||||
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
|
||||
#endif
|
||||
#define v128_shuflr64_16 mm128_shuflr64_16
|
||||
|
||||
// Rotate 32 bit lanes
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm128_swap32_16( v ) _mm_ror_epi32( v, 16 )
|
||||
#elif defined(__SSSE3__)
|
||||
#define mm128_swap32_16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
|
||||
#else
|
||||
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
|
||||
#endif
|
||||
#define mm128_shuflr32_16 mm128_swap32_16
|
||||
#define mm128_shufll32_16 mm128_swap32_16
|
||||
#define v128_swap32_16 mm128_swap32_16
|
||||
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
|
||||
#elif defined(__SSSE3__)
|
||||
#define mm128_shuflr32_8( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
|
||||
#else
|
||||
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
|
||||
#endif
|
||||
#define v128_shuflr32_8 mm128_shuflr32_8
|
||||
|
||||
//
|
||||
// Endian byte swap.
|
||||
|
||||
@@ -645,7 +704,22 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
0x0607040502030001 )
|
||||
|
||||
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
|
||||
#define mm128_block_bswap_64( d, s ) do \
|
||||
#define mm128_block_bswap_64( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
|
||||
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
|
||||
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
|
||||
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
|
||||
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap64_512 mm128_block_bswap_64
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
|
||||
|
||||
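// Usage sketch (illustrative, not from this commit): byte swap a 128 byte
// buffer as 8 x 16 bytes, e.g. to convert message data to big endian before
// hashing. Hypothetical names; both pointers assumed 16-byte aligned.
/*
   __m128i out[8];                       // 128 bytes of output
   mm128_block_bswap_64( out, src );     // src: const __m128i*, 128 aligned bytes
*/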
#define v128_block_bswap64_1024( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
|
||||
@@ -656,10 +730,33 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
|
||||
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
|
||||
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
|
||||
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
|
||||
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
|
||||
#define mm128_block_bswap_32( d, s ) do \
|
||||
#define mm128_block_bswap_32( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
|
||||
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
|
||||
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
|
||||
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
|
||||
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
|
||||
#define v128_block_bswap32_512( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
|
||||
@@ -670,7 +767,15 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
|
||||
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
|
||||
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
|
||||
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
|
||||
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
#else // SSE2
|
||||
|
||||
@@ -707,6 +812,27 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
|
||||
d[6] = mm128_bswap_64( s[6] );
|
||||
d[7] = mm128_bswap_64( s[7] );
|
||||
}
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
|
||||
|
||||
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = mm128_bswap_64( s[ 0] );
|
||||
d[ 1] = mm128_bswap_64( s[ 1] );
|
||||
d[ 2] = mm128_bswap_64( s[ 2] );
|
||||
d[ 3] = mm128_bswap_64( s[ 3] );
|
||||
d[ 4] = mm128_bswap_64( s[ 4] );
|
||||
d[ 5] = mm128_bswap_64( s[ 5] );
|
||||
d[ 6] = mm128_bswap_64( s[ 6] );
|
||||
d[ 7] = mm128_bswap_64( s[ 7] );
|
||||
d[ 8] = mm128_bswap_64( s[ 8] );
|
||||
d[ 9] = mm128_bswap_64( s[ 9] );
|
||||
d[10] = mm128_bswap_64( s[10] );
|
||||
d[11] = mm128_bswap_64( s[11] );
|
||||
d[12] = mm128_bswap_64( s[12] );
|
||||
d[13] = mm128_bswap_64( s[13] );
|
||||
d[14] = mm128_bswap_64( s[14] );
|
||||
d[15] = mm128_bswap_64( s[15] );
|
||||
}
|
||||
|
||||
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
{
|
||||
@@ -719,6 +845,28 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
d[6] = mm128_bswap_32( s[6] );
|
||||
d[7] = mm128_bswap_32( s[7] );
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
|
||||
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = mm128_bswap_32( s[ 0] );
|
||||
d[ 1] = mm128_bswap_32( s[ 1] );
|
||||
d[ 2] = mm128_bswap_32( s[ 2] );
|
||||
d[ 3] = mm128_bswap_32( s[ 3] );
|
||||
d[ 4] = mm128_bswap_32( s[ 4] );
|
||||
d[ 5] = mm128_bswap_32( s[ 5] );
|
||||
d[ 6] = mm128_bswap_32( s[ 6] );
|
||||
d[ 7] = mm128_bswap_32( s[ 7] );
|
||||
d[ 8] = mm128_bswap_32( s[ 8] );
|
||||
d[ 9] = mm128_bswap_32( s[ 9] );
|
||||
d[10] = mm128_bswap_32( s[10] );
|
||||
d[11] = mm128_bswap_32( s[11] );
|
||||
d[12] = mm128_bswap_32( s[12] );
|
||||
d[13] = mm128_bswap_32( s[13] );
|
||||
d[14] = mm128_bswap_32( s[14] );
|
||||
d[15] = mm128_bswap_32( s[15] );
|
||||
}
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
@@ -747,5 +895,21 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
|
||||
#endif
|
||||
|
||||
// NEON only uses vector mask. x86 blend selects second arg when control bit
|
||||
// is set. Blendv selects second arg when sign bit is set. And masking is the
|
||||
// opposite, elements are selected from the first arg if the mask bits are set.
|
||||
// Arm blend is a bit-by-bit blend while x86 is an element blend.
// Reverse the logic so that mask use is consistent with both formats.
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
#define v128_blendv _mm_blendv_epi8
|
||||
|
||||
#else
|
||||
|
||||
#define v128_blendv( v1, v0, mask ) \
|
||||
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
|
||||
|
||||
#endif
|
||||
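// Illustrative note (not part of the header): the fallback above is the classic
// bitwise select, r = (mask & v1) | (~mask & v0), so an all-ones lane in the
// mask selects that lane from v1 and an all-zeros lane selects it from v0.
// The SSE4.1 path maps directly to _mm_blendv_epi8, which selects its second
// operand where the mask byte's sign bit is set. Scalar model of one 64-bit
// lane of the fallback path, for reference:
/*
static inline uint64_t blendv64_model( uint64_t v1, uint64_t v0, uint64_t mask )
{  return ( mask & v1 ) | ( ~mask & v0 );  }
*/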
|
||||
#endif // __SSE2__
|
||||
#endif // SIMD_128_H__
|
||||
|
@@ -217,6 +217,69 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
|
||||
// Slow version, used as last resort
|
||||
#define mm256_ror_64_avx2( v, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
|
||||
_mm256_slli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm256_rol_64_avx2( v, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
|
||||
_mm256_srli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm256_ror_32_avx2( v, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
|
||||
_mm256_slli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm256_rol_32_avx2( v, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
|
||||
_mm256_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define mm256_ror_64 _mm256_ror_epi64
|
||||
#define mm256_rol_64 _mm256_rol_epi64
|
||||
#define mm256_ror_32 _mm256_ror_epi32
|
||||
#define mm256_rol_32 _mm256_rol_epi32
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_ror_64( v, c ) \
|
||||
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) \
|
||||
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) ) \
|
||||
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) \
|
||||
: mm256_ror_64_avx2( v, c )
|
||||
|
||||
#define mm256_rol_64( v, c ) \
|
||||
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) \
|
||||
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) ) \
|
||||
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) \
|
||||
: mm256_rol_64_avx2( v, c )
|
||||
|
||||
#define mm256_ror_32( v, c ) \
|
||||
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) )\
|
||||
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) \
|
||||
: mm256_ror_32_avx2( v, c )
|
||||
|
||||
#define mm256_rol_32( v, c ) \
|
||||
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
|
||||
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
|
||||
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) \
|
||||
: mm256_rol_32_avx2( v, c )
|
||||
|
||||
#endif
|
||||
|
||||
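// Note (illustrative, not from this commit): c is expected to be a compile-time
// constant, so the ternary chains above fold to a single instruction, e.g.
/*
   __m256i v = _mm256_set1_epi64x( 0x1122334455667788ULL );
   __m256i r = mm256_ror_64( v, 16 );   // one byte shuffle on AVX2, vprorq on AVX512VL;
                                        // each lane becomes 0x7788112233445566
*/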
//
|
||||
// x2 rotates elements in 2 individual vectors in a double buffered
|
||||
// optimization for AVX2, does nothing for AVX512 but is here for
|
||||
@@ -224,12 +287,12 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
/*
|
||||
#define mm256_ror_64 _mm256_ror_epi64
|
||||
#define mm256_rol_64 _mm256_rol_epi64
|
||||
#define mm256_ror_32 _mm256_ror_epi32
|
||||
#define mm256_rol_32 _mm256_rol_epi32
|
||||
|
||||
*/
|
||||
#define mm256_rorx2_64( v1, v0, c ) \
|
||||
_mm256_ror_epi64( v0, c ); \
|
||||
_mm256_ror_epi64( v1, c )
|
||||
@@ -247,7 +310,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
_mm256_rol_epi32( v1, c )
|
||||
|
||||
#else // AVX2
|
||||
|
||||
/*
|
||||
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
|
||||
|
||||
#define mm256_ror_64( v, c ) \
|
||||
@@ -265,7 +328,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
#define mm256_rol_32( v, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
|
||||
_mm256_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
*/
|
||||
#define mm256_rorx2_64( v1, v0, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_srli_epi64( v0, c ); \
|
||||
@@ -372,49 +435,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
{ return _mm256_alignr_epi8( v, v, c ); }
|
||||
*/
|
||||
|
||||
// 64 bit lanes
|
||||
|
||||
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||
#define mm256_shuflr64_32 mm256_swap64_32
|
||||
#define mm256_shufll64_32 mm256_swap64_32
|
||||
|
||||
//TODO Enable for AVX10_256
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
|
||||
#else
|
||||
#define mm256_shuflr64_24( v ) \
|
||||
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
|
||||
#else
|
||||
#define mm256_shuflr64_16( v ) \
|
||||
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) )
|
||||
#endif
|
||||
|
||||
// 32 bit lanes
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
|
||||
#else
|
||||
#define mm256_swap32_16( v ) \
|
||||
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) )
|
||||
#endif
|
||||
#define mm256_shuflr32_16 mm256_swap32_16
|
||||
#define mm256_shufll32_16 mm256_swap32_16
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
|
||||
#else
|
||||
#define mm256_shuflr32_8( v ) \
|
||||
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
|
||||
0x0c0f0e0d080b0a09, 0x0407060500030201, \
|
||||
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
|
||||
#endif
|
||||
// Same as bit rotation but logically used as byte/word rotation.
|
||||
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 )
|
||||
|
||||
// Reverse byte order in elements, endian bswap.
|
||||
#define mm256_bswap_64( v ) \
|
||||
@@ -428,10 +450,11 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
#define mm256_bswap_16( v ) \
|
||||
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
|
||||
//
|
||||
|
||||
// Source and destination are pointers, may point to same memory.
|
||||
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
|
||||
#define mm256_block_bswap_64( d, s ) do \
|
||||
#define mm256_block_bswap_64( d, s ) \
|
||||
{ \
|
||||
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) ); \
|
||||
@@ -443,10 +466,33 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
|
||||
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
|
||||
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
}
|
||||
#define mm256_block_bswap64_512 mm256_block_bswap_64
|
||||
|
||||
#define mm256_block_bswap64_1024( d, s ) \
|
||||
{ \
|
||||
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) ); \
|
||||
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
|
||||
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
|
||||
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
|
||||
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
|
||||
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
|
||||
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
|
||||
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
|
||||
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
|
||||
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
|
||||
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
|
||||
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
|
||||
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
|
||||
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
|
||||
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
|
||||
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
|
||||
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
|
||||
#define mm256_block_bswap_32( d, s ) do \
|
||||
#define mm256_block_bswap_32( d, s ) \
|
||||
{ \
|
||||
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ) ); \
|
||||
@@ -458,7 +504,31 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
|
||||
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
|
||||
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
}
|
||||
#define mm256_block_bswap32_256 mm256_block_bswap_32
|
||||
|
||||
#define mm256_block_bswap32_512( d, s ) \
|
||||
{ \
|
||||
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ) ); \
|
||||
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
|
||||
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
|
||||
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
|
||||
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
|
||||
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
|
||||
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
|
||||
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
|
||||
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
|
||||
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
|
||||
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
|
||||
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
|
||||
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
|
||||
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
|
||||
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
|
||||
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
|
||||
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
|
||||
#endif // __AVX2__
|
||||
#endif // SIMD_256_H__
|
||||
|
@@ -251,7 +251,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
|
||||
// Source and destination are pointers, may point to same memory.
|
||||
// 8 lanes of 64 bytes each
|
||||
#define mm512_block_bswap_64( d, s ) do \
|
||||
#define mm512_block_bswap_64( d, s ) \
|
||||
{ \
|
||||
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
@@ -263,10 +263,33 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
|
||||
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
|
||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
}
|
||||
#define mm512_block_bswap64_512 mm512_block_bswap_64
|
||||
|
||||
#define mm512_block_bswap64_1024( d, s ) \
|
||||
{ \
|
||||
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
||||
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
||||
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
||||
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
|
||||
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
|
||||
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
|
||||
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
|
||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
|
||||
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
|
||||
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
|
||||
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
|
||||
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
|
||||
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
|
||||
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
|
||||
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
// 16 lanes of 32 bytes each
|
||||
#define mm512_block_bswap_32( d, s ) do \
|
||||
#define mm512_block_bswap_32( d, s ) \
|
||||
{ \
|
||||
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
|
||||
@@ -278,7 +301,31 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
|
||||
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
|
||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||
} while(0)
|
||||
}
|
||||
#define mm512_block_bswap32_256 mm512_block_bswap_32
|
||||
|
||||
#define mm512_block_bswap32_512( d, s ) \
|
||||
{ \
|
||||
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
|
||||
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
||||
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
||||
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
||||
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
|
||||
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
|
||||
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
|
||||
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
|
||||
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
|
||||
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
|
||||
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
|
||||
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
|
||||
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
|
||||
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
|
||||
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
|
||||
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
|
||||
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Cross-lane shuffles implementing rotation of packed elements.
|
||||
|
@@ -1,38 +1,83 @@
|
||||
#if !defined(SIMD_64_H__)
|
||||
#define SIMD_64_H__ 1
|
||||
|
||||
#if defined(__x86_64__) && defined(__MMX__) && defined(__SSE__)
|
||||
#if defined(__x86_64__) && defined(__MMX__)
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// 64 bit MMX vectors.
|
||||
//
|
||||
// This code is not used anywhere and likely never will be. Its intent was
// to support 2 way parallel hashing using SSE2 for 64 bit, and MMX for 32
// bit hash functions, but was never implemented.
// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
// functions, but hasn't been implemented.
|
||||
//
|
||||
|
||||
#define v64_t __m64
|
||||
#define v64u32_t v64_t
|
||||
|
||||
#define v64_load _mm_load_si64
|
||||
#define v64_store _mm_store_si64
|
||||
|
||||
#define v64_64(i64) ((__m64)(i64))
|
||||
#define v64_32 _mm_set1_pi32
|
||||
#define v64_16 _mm_set1_pi16
|
||||
#define v64_8 _mm_set1_pi8
|
||||
|
||||
#define v64_add32 _mm_add_pi32
|
||||
#define v64_add16 _mm_add_pi16
|
||||
#define v64_add8 _mm_add_pi8
|
||||
|
||||
#define v64_mul32 _mm_mullo_pi32
|
||||
#define v64_mul16 _mm_mullo_pi16
|
||||
|
||||
// compare
|
||||
#define v64_cmpeq32 _mm_cmpeq_epi32
|
||||
#define v64_cmpeq16 _mm_cmpeq_epi16
|
||||
#define v64_cmpeq8 _mm_cmpeq_epi8
|
||||
|
||||
#define v64_cmpgt32 _mm_cmpgt_epi32
|
||||
#define v64_cmpgt16 _mm_cmpgt_epi16
|
||||
#define v64_cmpgt8 _mm_cmpgt_epi8
|
||||
|
||||
#define v64_cmplt32 _mm_cmplt_epi32
|
||||
#define v64_cmplt16 _mm_cmplt_epi16
|
||||
#define v64_cmplt8 _mm_cmplt_epi8
|
||||
|
||||
// bit shift
|
||||
#define v64_sl32 _mm_slli_epi32
|
||||
#define v64_sl16 _mm_slli_epi16
|
||||
#define v64_sl8 _mm_slli_epi8
|
||||
|
||||
#define v64_sr32 _mm_srli_epi32
|
||||
#define v64_sr16 _mm_srli_epi16
|
||||
#define v64_sr8 _mm_srli_epi8
|
||||
|
||||
#define v64_sra32 _mm_srai_epi32
|
||||
#define v64_sra16 _mm_srai_epi16
|
||||
#define v64_sra8 _mm_srai_epi8
|
||||
|
||||
#define v64_alignr8 _mm_alignr_pi8
|
||||
#define v64_unpacklo32 _mm_unpacklo_pi32
|
||||
#define v64_unpackhi32 _mm_unpackhi_pi32
|
||||
#define v64_unpacklo16 _mm_unpacklo_pi16
|
||||
#define v64_unpackhi16 _mm_unpackhi_pi16
|
||||
#define v64_unpacklo8 _mm_unpacklo_pi8
|
||||
#define v64_unpackhi8 _mm_unpackhi_pi8
|
||||
|
||||
// Pseudo constants
|
||||
|
||||
/*
|
||||
#define m64_zero _mm_setzero_si64()
|
||||
#define m64_one_64 _mm_set_pi32( 0UL, 1UL )
|
||||
#define m64_one_32 _mm_set1_pi32( 1UL )
|
||||
#define m64_one_16 _mm_set1_pi16( 1U )
|
||||
#define m64_one_8 _mm_set1_pi8( 1U );
|
||||
#define m64_neg1 _mm_set1_pi32( 0xFFFFFFFFUL )
|
||||
*/
|
||||
#define m64_zero ( (__m64)0ULL )
|
||||
#define m64_one_64 ( (__m64)1ULL )
|
||||
#define m64_one_32 ( (__m64)0x0000000100000001ULL )
|
||||
#define m64_one_16 ( (__m64)0x0001000100010001ULL )
|
||||
#define m64_one_8 ( (__m64)0x0101010101010101ULL )
|
||||
#define m64_neg1 ( (__m64)0xFFFFFFFFFFFFFFFFULL )
|
||||
#define v64_zero _mm_setzero_si64()
|
||||
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
|
||||
#define v64_one_32 v64_32( 1UL )
|
||||
#define v64_one_16 v64_16( 1U )
|
||||
#define v64_one_8 v64_8( 1U )
|
||||
#define v64_neg1 v64_32( 0xFFFFFFFFUL )
|
||||
|
||||
#define casti_m64(p,i) (((__m64*)(p))[(i)])
|
||||
#define casti_v64(p,i) (((v64_t*)(p))[(i)])
|
||||
|
||||
// Bitwise not: ~(a)
|
||||
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
|
||||
#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) ) )
|
||||
|
||||
/*
|
||||
// Unary negate elements
|
||||
@@ -41,69 +86,95 @@
|
||||
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
|
||||
*/
|
||||
|
||||
static inline void v64_memset_zero( __m64 *dst, const int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
|
||||
|
||||
static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
|
||||
|
||||
static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
|
||||
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
|
||||
|
||||
#define v64_or _mm_or_si64
|
||||
#define v64_and _mm_and_si64
|
||||
#define v64_xor _mm_xor_si64
|
||||
#define v64_andnot _mm_andnot_si64
|
||||
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_xor( v1, v0 ) )
|
||||
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
|
||||
|
||||
|
||||
// Rotate bits in packed elements of 64 bit vector
|
||||
#define mm64_rol_64( a, n ) \
|
||||
#define v64_rol64( a, n ) \
|
||||
_mm_or_si64( _mm_slli_si64( a, n ), \
|
||||
_mm_srli_si64( a, 64-(n) ) )
|
||||
|
||||
#define mm64_ror_64( a, n ) \
|
||||
#define v64_ror64( a, n ) \
|
||||
_mm_or_si64( _mm_srli_si64( a, n ), \
|
||||
_mm_slli_si64( a, 64-(n) ) )
|
||||
|
||||
#define mm64_rol_32( a, n ) \
|
||||
#define v64_rol32( a, n ) \
|
||||
_mm_or_si64( _mm_slli_pi32( a, n ), \
|
||||
_mm_srli_pi32( a, 32-(n) ) )
|
||||
|
||||
#define mm64_ror_32( a, n ) \
|
||||
#define v64_ror32( a, n ) \
|
||||
_mm_or_si64( _mm_srli_pi32( a, n ), \
|
||||
_mm_slli_pi32( a, 32-(n) ) )
|
||||
|
||||
#define mm64_rol_16( a, n ) \
|
||||
#define v64_rol16( a, n ) \
|
||||
_mm_or_si64( _mm_slli_pi16( a, n ), \
|
||||
_mm_srli_pi16( a, 16-(n) ) )
|
||||
|
||||
#define mm64_ror_16( a, n ) \
|
||||
#define v64_ror16( a, n ) \
|
||||
_mm_or_si64( _mm_srli_pi16( a, n ), \
|
||||
_mm_slli_pi16( a, 16-(n) ) )
|
||||
|
||||
// Rotate packed elements accross lanes. Useful for byte swap and byte
|
||||
// rotation.
|
||||
|
||||
// Swap hi & lo 32 bits.
|
||||
#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e )
|
||||
#if defined(__SSE__)
|
||||
|
||||
#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 )
|
||||
#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 )
|
||||
// Swap hi & lo 32 bits.
|
||||
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )
|
||||
|
||||
#define v64_shulfr16( a ) _mm_shuffle_pi16( a, 0x39 )
|
||||
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )
|
||||
|
||||
// Swap hi & lo 16 bits of each 32 bit element
|
||||
#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
|
||||
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
|
||||
|
||||
#endif // SSE
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Endian byte swap packed elements
|
||||
#define mm64_bswap_32( v ) \
|
||||
|
||||
#define v64_bswap32( v ) \
|
||||
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
|
||||
|
||||
#define mm64_bswap_16( v ) \
|
||||
#define v64_bswap16( v ) \
|
||||
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
|
||||
|
||||
// Rotate right by c bytes
|
||||
static inline __m64 mm64_vror_x8( __m64 v, const int c )
|
||||
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
|
||||
{ return _mm_alignr_pi8( v, v, c ); }
|
||||
|
||||
#else
|
||||
|
||||
#define mm64_bswap_32( v ) \
|
||||
#define v64_bswap32( v ) \
|
||||
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
|
||||
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
|
||||
|
||||
#define mm64_bswap_16( v ) \
|
||||
#define v64_bswap16( v ) \
|
||||
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
|
||||
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
|
||||
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
|
||||
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
|
||||
|
||||
#endif
|
||||
#endif // SSSE3
|
||||
|
||||
#define v64_blendv( v1, v0, mask ) \
|
||||
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
|
||||
|
||||
|
||||
#endif // MMX
|
||||
|
||||
|
@@ -39,8 +39,66 @@ static inline uint32_t bswap_32( uint32_t a )
|
||||
( ( ( (x) << 24 ) & 0xff000000 ) | ( ((x) << 8 ) & 0x00ff0000 ) \
|
||||
| ( ( (x) >> 8 ) & 0x0000ff00 ) | ( ((x) >> 24 ) & 0x000000ff ) )
|
||||
|
||||
// Poorman's 2 way parallel SIMD uses u64 to bswap 2 u32
|
||||
#define bswap_32x2( u64 ) \
|
||||
( ( (u64) & 0xff000000ff000000 ) >> 24 ) \
|
||||
| ( ( (u64) & 0x00ff000000ff0000 ) >> 8 ) \
|
||||
| ( ( (u64) & 0x0000ff000000ff00 ) << 8 ) \
|
||||
| ( ( (u64) & 0x000000ff000000ff ) << 24 )
|
||||
|
||||
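// Illustrative example (not part of the header): bswap_32x2 byte swaps each
// 32-bit half of a 64-bit word independently, e.g.
//    bswap_32x2( 0x1122334455667788ULL ) == 0x4433221188776655ULL
// A minimal sketch of the intended 2-way use, assuming an even element count
// and 8-byte aligned data (hypothetical helper):
/*
static inline void bswap32_array_2way( uint32_t *p, size_t n_u32 )
{
   uint64_t *q = (uint64_t*)p;
   for ( size_t i = 0; i < n_u32 / 2; i++ ) q[i] = bswap_32x2( q[i] );
}
*/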
#endif
|
||||
|
||||
// 128 bit byte swap
#define bswap_128( x ) \
   ( ( (uint128_t)( bswap_64( (uint64_t)( (x) & 0xffffffffffffffff ) ) ) << 64 ) \
   | ( (uint128_t)( bswap_64( (uint64_t)( (x) >> 64 ) ) ) ) )
|
||||
|
||||
|
||||
// Set byte order regardless of host order.
|
||||
static inline uint64_t be64( const uint64_t u64 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u64;
|
||||
return ( ( ( (uint64_t)(p[7]) + ( (uint64_t)(p[6]) << 8 ) ) +
|
||||
( ( (uint64_t)(p[5]) << 16 ) + ( (uint64_t)(p[4]) << 24 ) ) ) +
|
||||
( ( (uint64_t)(p[3]) << 32 ) + ( (uint64_t)(p[2]) << 40 ) ) +
|
||||
( ( (uint64_t)(p[1]) << 48 ) + ( (uint64_t)(p[0]) << 56 ) ) );
|
||||
}
|
||||
|
||||
static inline uint64_t le64( const uint64_t u64 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u64;
|
||||
return ( ( ( (uint64_t)(p[0]) + ( (uint64_t)(p[1]) << 8 ) ) +
|
||||
( ( (uint64_t)(p[2]) << 16 ) + ( (uint64_t)(p[3]) << 24 ) ) ) +
|
||||
( ( (uint64_t)(p[4]) << 32 ) + ( (uint64_t)(p[5]) << 40 ) ) +
|
||||
( ( (uint64_t)(p[6]) << 48 ) + ( (uint64_t)(p[7]) << 56 ) ) );
|
||||
}
|
||||
|
||||
static inline uint32_t be32( const uint32_t u32 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u32;
|
||||
return ( ( (uint32_t)(p[3]) + ( (uint32_t)(p[2]) << 8 ) ) +
|
||||
( ( (uint32_t)(p[1]) << 16 ) + ( (uint32_t)(p[0]) << 24 ) ) );
|
||||
}
|
||||
|
||||
static inline uint32_t le32( const uint32_t u32 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u32;
|
||||
return ( ( (uint32_t)(p[0]) + ( (uint32_t)(p[1]) << 8 ) ) +
|
||||
( ( (uint32_t)(p[2]) << 16) + ( (uint32_t)(p[3]) << 24 ) ) );
|
||||
}
|
||||
|
||||
static inline uint16_t be16( const uint16_t u16 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u16;
|
||||
return ( (uint16_t)(p[1]) ) + ( (uint16_t)(p[0]) << 8 );
|
||||
}
|
||||
|
||||
static inline uint16_t le16( const uint16_t u16 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u16;
|
||||
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
|
||||
}
|
||||
|
||||
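// Usage sketch (illustrative, hypothetical helper): read a 32-bit big-endian
// field from a raw buffer regardless of host byte order; memcpy (from
// <string.h>) avoids alignment and aliasing issues.
/*
static inline uint32_t load_be32( const void *p )
{
   uint32_t w;
   memcpy( &w, p, 4 );
   return be32( w );
}
*/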
// Bit rotation
|
||||
#if defined(__x86_64__)
|
||||
|
||||
@@ -51,10 +109,6 @@ static inline uint32_t bswap_32( uint32_t a )
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
//#pragma message "aarch64 fast bit rotation"
|
||||
|
||||
// "ror" instruction (intrinsic?) for 32 & 64 bits, args must determine size.
|
||||
|
||||
static inline uint64_t ror64( uint64_t a, const int c )
|
||||
{
|
||||
uint64_t b;
|
||||
|
@@ -1,72 +1,118 @@
|
||||
#if !defined(SIMD_NEON_H__)
|
||||
#define SIMD_NEON_H__ 1
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON)
|
||||
|
||||
// targeted functions using generic names makes portable obsolete
|
||||
// Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
|
||||
// Size matters!
|
||||
//
|
||||
// Intel naming is generally used.
|
||||
//
|
||||
// documented instructions that aren't defined on RPi 4.
|
||||
// They seem to be all 3 op instructionsi.
|
||||
//
|
||||
// veor3q ie xor3
|
||||
// vxarq_u64( v1, v0, n ) ror( xor( v1, v0 ), n )
|
||||
// vraxlq_u64( v1, v0 ) xor( rol( v1, 1 ), rol( v0, 1 ) )
|
||||
// vbcaxq( v2, v1, v0 ) xor( v2, and( v1, not(v0) ) )
|
||||
//
|
||||
// might not work, not tried yet:
|
||||
//
|
||||
// vornq( v1, v0 ) or( v1, not( v0 ) )
|
||||
// vsraq_n( v1, v0, n ) add( v1, sr( v0, n ) )
|
||||
|
||||
#define v128_t uint32x4_t
|
||||
#define v128_t uint32x4_t // default,
|
||||
#define v128u64_t uint64x2_t
|
||||
#define v128u32_t uint32x4_t
|
||||
#define v128u16_t uint16x8_t
|
||||
#define v128u8_t uint8x16_t
|
||||
|
||||
// load & store
|
||||
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
|
||||
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
|
||||
|
||||
// load & set1 combined
|
||||
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
|
||||
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
|
||||
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
|
||||
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
|
||||
|
||||
// arithmetic
|
||||
#define v128_add64 vaddq_u64
|
||||
#define v128_add32 vaddq_u32
|
||||
#define v128_add16 vaddq_u16
|
||||
#define v128_add8 vaddq_u8
|
||||
|
||||
#define v128_add4_64( v3, v2, v1, v0 ) \
|
||||
vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
|
||||
|
||||
#define v128_add4_32( v3, v2, v1, v0 ) \
|
||||
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
|
||||
|
||||
#define v128_sub64 vsubq_u64
|
||||
#define v128_sub32 vsubq_u32
|
||||
#define v128_sub16 vsubq_u16
|
||||
#define v128_sub8 vsubq_u8
|
||||
|
||||
// return low half
|
||||
#define v128_mullo64 vmulq_u64
|
||||
#define v128_mullo32 vmulq_u32
|
||||
#define v128_mullo16 vmulq_u16
|
||||
// returns low half, u64 undocumented, may not exist.
|
||||
#define v128_mul64 vmulq_u64
|
||||
#define v128_mul32 vmulq_u32
|
||||
#define v128_mul16 vmulq_u16
|
||||
|
||||
// widen not working, use placeholders
|
||||
//#define v128_mul32 vmull_u32
|
||||
//#define v128_mul16 vmull_u16
|
||||
#define v128_mul64 vmulq_u64
|
||||
#define v128_mul32 vmulq_u32
|
||||
#define v128_mul16 vmulq_u16
|
||||
// slow, tested with argon2d
|
||||
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
|
||||
{
|
||||
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
|
||||
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
|
||||
}
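
// Illustrative usage sketch (not part of the original header): like Intel's
// _mm_mul_epu32, v128_mulw32 multiplies the even numbered 32 bit lanes
// (0 and 2) and widens the products to 64 bits. Assumes <arm_neon.h> is
// already included; the function name is hypothetical.
static inline uint64x2_t mulw32_example( uint32x4_t a, uint32x4_t b )
{
   // result lane 0 = (uint64_t)a[0] * b[0], lane 1 = (uint64_t)a[2] * b[2]
   return v128_mulw32( a, b );
}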

// compare
#define v128_cmpeq64               vceqq_u64
#define v128_cmpeq32               vceqq_u32
#define v128_cmpeq16               vceqq_u16
#define v128_cmpeq8                vceqq_u8

#define v128_cmpeq0                vceqzq_u64

#define v128_cmpgt64               vcgtq_u64
#define v128_cmpgt32               vcgtq_u32
#define v128_cmpgt16               vcgtq_u16
#define v128_cmpgt8                vcgtq_u8

#define v128_cmplt64               vcltq_u64
#define v128_cmplt32               vcltq_u32
#define v128_cmplt16               vcltq_u16
#define v128_cmplt8                vcltq_u8

// bit shift & rotate
// bit shift
#define v128_sl64                  vshlq_n_u64
#define v128_sl32                  vshlq_n_u32
#define v128_sl16                  vshlq_n_u16
#define v128_sl8                   vshlq_n_u8

#define v128_sr64                  vshrq_n_u64
#define v128_sr32                  vshrq_n_u32
#define v128_sr16                  vshrq_n_u16
#define v128_sr8                   vshrq_n_u8

// Maybe signed shift will work.
#define v128_sra64                 vshrq_n_s64
#define v128_sra32                 vshrq_n_s32
#define v128_sra16                 vshrq_n_s16

// logical ops
// logic
#define v128_or                    vorrq_u32
#define v128_and                   vandq_u32
#define v128_not                   vmvnq_u32
#define v128_xor                   veorq_u32

#define v128_xor3( v2, v1, v0 )    v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3                  veor3q_u32
#define v128_nor                   vornq_u32
#define v128_andnot( v1, v0 )      vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xnor( a, b )          v128_not( v128_xor( a, b ) )
#define v128_ornot                 vornq_u32

// ternary logic, veor3q_u32 not defined
//#define v128_xor3                  veor3q_u32
#define v128_xor3( v2, v1, v0 )    veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_nor                   vornq_u32
#define v128_xorandnot( v2, v1, v0 )  v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c )       v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c )        v128_or( a, v128_or( b, c ) )
@@ -74,23 +120,31 @@
#define v128_andxor( a, b, c )     v128_and( a, v128_xor( b, c ))
#define v128_xoror( a, b, c )      v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c )      v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b )          v128_not( v128_xor( a, b ) )

#define v128_alignr64              vextq_u64
#define v128_alignr32              vextq_u32
#define v128_alignr8               vextq_u8
// shift 2 concatenated vectors right.
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c )  vextq_u8( v0, v1, c )

#define v128_unpacklo64            vtrn1q_u64
#define v128_unpackhi64            vtrn2q_u64
// Interleave high or low half of 2 vectors.
#define v128_unpacklo64( v1, v0 )  vzip1q_u64( v0, v1 )
#define v128_unpackhi64( v1, v0 )  vzip2q_u64( v0, v1 )
#define v128_unpacklo32( v1, v0 )  vzip1q_u32( v0, v1 )
#define v128_unpackhi32( v1, v0 )  vzip2q_u32( v0, v1 )
#define v128_unpacklo16( v1, v0 )  vzip1q_u16( v0, v1 )
#define v128_unpackhi16( v1, v0 )  vzip2q_u16( v0, v1 )
#define v128_unpacklo8( v1, v0 )   vzip1q_u8( v0, v1 )
#define v128_unpackhi8( v1, v0 )   vzip2q_u8( v0, v1 )

#define v128_unpacklo32            vtrn1q_u32
#define v128_unpackhi32            vtrn2q_u32

#define v128_unpacklo16            vtrn1q_u16
#define v128_unpackhi16            vtrn2q_u16

#define v128_unpacklo8             vtrn1q_u8
#define v128_unpackhi8             vtrn2q_u8
// Shorter agnostic names for unpack using NEON-like syntax
#define v128_ziplo64               vzip1q_u64
#define v128_ziphi64               vzip2q_u64
#define v128_ziplo32               vzip1q_u32
#define v128_ziphi32               vzip2q_u32
#define v128_ziplo16               vzip1q_u16
#define v128_ziphi16               vzip2q_u16
#define v128_ziplo8                vzip1q_u8
#define v128_ziphi8                vzip2q_u8

// AES
// consistent with Intel AES, break up for optimizing
@@ -100,16 +154,26 @@
#define v128_aesdec( v, k )        vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k )    vaesdq_u8( v, k )

typedef union
{
   uint32x4_t m128;
   uint32_t u32[4];
} __attribute__ ((aligned (16))) v128_ovly;
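
// Illustrative usage sketch (not part of the original header): the overlay
// union gives scalar access to individual lanes without vgetq_lane, at the
// cost of a round trip through memory. The function name is hypothetical.
static inline uint32_t ovly_lane0_example( uint32x4_t v )
{
   v128_ovly o;
   o.m128 = v;
   return o.u32[0];   // read lane 0 as a scalar
}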

// pointer indexing
#define casti_v128( p, i )         (((uint32x4_t*)(p))[i])

#define cast_v128( p )             (*((uint32x4_t*)(p)))
#define castp_v128( p )            ((uint32x4_t*)(p))

#define casti_v128u64( p, i )      (((uint64x2_t*)(p))[i])
#define cast_v128u64( p )          (*((uint64x2_t*)(p)))
#define castp_v128u64( p )         ((uint64x2_t*)(p))

// Many NEON instructions are sized when they don't need to be, for example
// zero, which may cause the compiler to complain when the sizes don't match.
// use "-flax-vector-conversions".
#define casti_v128u32( p, i )      (((uint32x4_t*)(p))[i])
#define cast_v128u32( p )          (*((uint32x4_t*)(p)))
#define castp_v128u32( p )         ((uint32x4_t*)(p))

// use C cast, flexible source type
#define u32_to_u64                 vreinterpretq_u64_u32
#define u64_to_u32                 vreinterpretq_u32_u64

@@ -120,123 +184,332 @@
#define u8_to_u32                  vreinterpretq_u32_u8

#define v128_zero                  v128_64( 0ull )
//#define v128_zero_fn()             v128_64( 0ull )
//#define v128_zero                  v128_zero_fn

#define v128_cmpeq_zero            vceqzq_u64

#define v128_neg1                  v128_64( 0xffffffffffffffffull )

// set1
#define v128_32                    vmovq_n_u32
#define v128_64                    vmovq_n_u64
#define v128_32                    vmovq_n_u32
#define v128_16                    vmovq_n_u16
#define v128_8                     vmovq_n_u8

#define v64_set32( u32_1, u32_0 ) \
   vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
   vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
                            | (uint32_t)(u16_2)          ) << 32 ) \
              | ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
                            | (uint32_t)(u16_0) ) ) )

#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
   vcreate_u8( \
     ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
                              | (uint16_t)(u8_6)       ) << 16 ) \
                 | ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
                              | (uint16_t)(u8_4) ) )) << 32 ) \
   | ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
                              | (uint16_t)(u8_2)       ) << 16 ) \
                 | ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
                              | (uint16_t)(u8_0) ) )) ))

#define v128_set64( u64_1, u64_0 ) \
   ( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64    v128_set64   // deprecated
   vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )

#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
   (uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
               | ( (uint128_t)(u32_1) << 64 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32    v128_set32   // deprecated
   vcombine_u32( v64_set32( u32_1, u32_0 ), v64_set32( u32_3, u32_2 ) )

#define v128_set16( u16_7, u16_6, u16_5, u16_4, u16_3, u16_2, u16_1, u16_0 ) \
   vcombine_u16( v64_set16( u16_3, u16_2, u16_1, u16_0 ), \
                 v64_set16( u16_7, u16_6, u16_5, u16_4 ) )

#define v128_set8( u8_f, u8_e, u8_d, u8_c, u8_b, u8_a, u8_9, u8_8, \
                   u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
   vcombine_u8( v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ), \
                v64_set8( u8_f, u8_e, u8_d, u8_c, u8_b, u8_a, u8_9, u8_8 ) )
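
// Illustrative usage sketch (not part of the original header): building a
// constant vector with the set macros, here a hypothetical lane index
// pattern. Arguments are listed high lane first, matching Intel set order.
static inline uint32x4_t set32_example( void )
{
   return v128_set32( 3, 2, 1, 0 );   // lane 3 = 3 ... lane 0 = 0
}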

static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{   for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }
// move single element from source to dest, lanes must be immediate constant
// same as xim?
#define v128_movlane64( v1, l1, v0, l0 )  vcopyq_laneq_u64( v1, l1, v0, l0 )
#define v128_movlane32( v1, l1, v0, l0 )  vcopyq_laneq_u32( v1, l1, v0, l0 )
#define v128_movlane16( v1, l1, v0, l0 )  vcopyq_laneq_u16( v1, l1, v0, l0 )
#define v128_movlane8( v1, l1, v0, l0 )   vcopyq_laneq_u8( v1, l1, v0, l0 )

static inline void v128_memset( uint32x4_t *dst, const uint32x4_t *src,
                                const int n )
{   for( int i = 0; i < n; i++ ) dst[i] = src[i]; }
#define v128_get64( v, l )         vgetq_lane_u64( v, l )
#define v128_get32( v, l )         vgetq_lane_u32( v, l )
#define v128_get16( v, l )         vgetq_lane_u16( v, l )
#define v128_get8( v, l )          vgetq_lane_u8( v, l )

#define v128_put64( v, i64, l )    vsetq_lane_u64( i64, v, l )
#define v128_put32( v, i32, l )    vsetq_lane_u32( i32, v, l )
#define v128_put16( v, i16, l )    vsetq_lane_u16( i16, v, l )
#define v128_put8( v, i8, l )      vsetq_lane_u8( i8, v, l )

#define v128_negate64              vnegq_s64
#define v128_negate32              vnegq_s32
#define v128_negate16              vnegq_s16
#define v128_negate8               vnegq_s8

static inline void v128_memset_zero( void *dst, const int n )
{
   for( int i = 0; i < n; i++ )
      ((uint32x4_t*)dst)[i] = (uint32x4_t)(uint128_t)0;
}

static inline void v128_memset( void *dst, const void *src, const int n )
{
   for( int i = 0; i < n; i++ )
      ((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
}

static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
{   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

// select src & dst lanes
#define v128_mov32( dst, ld, src, ls )  vcopyq_laneq_u32( dst, ld, src, ls )

// move src u64 to lane 0, neon needs a source vector to write into
#define v128_mov64( u64 )    (uint64x2_t)(uint128_t)(u64)

static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }

static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }

static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }

#define v128_add4_32( v3, v2, v1, v0 ) \
   vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
static inline void v128_memcpy( void *dst, const void *src, const int n )
{
   for ( int i = 0; i < n; i ++ )
      ((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
}

// how to build a bitmask from vector elements?
#define v128_movmask32    _Static_assert (0, "No ARM target: v128_movmask32")
#define v128_movmask64    _Static_assert (0, "No ARM target: v128_movmask64")
#define v128_movmask32
#define v128_movmask64

// Bit rotation
//TODO, maybe, Optimize 64 bit rotations
// Fall back for odd bit rotations
static inline uint64x2_t v128_ror64( uint64x2_t v, int c )
{ return vsriq_n_u64( vshlq_n_u64( v, 64-c ), v, c ); }

static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }
static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
{ return vsriq_n_u64( vshlq_n_u64( v, c ), v, 64-c ); }

static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }
static inline uint32x4_t v128_ror32( uint32x4_t v, int c )
{ return vsriq_n_u32( vshlq_n_u32( v, 32-c ), v, c ); }

static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }
static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
{ return vsriq_n_u32( vshlq_n_u32( v, c ), v, 32-c ); }

static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }
static inline uint16x8_t v128_ror16( uint16x8_t v, int c )
{ return vsriq_n_u16( vshlq_n_u16( v, 16-c ), v, c ); }

static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }
static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
{ return vsriq_n_u16( vshlq_n_u16( v, c ), v, 16-c ); }

static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }
static inline uint8x16_t v128_ror8( uint8x16_t v, int c )
{ return vsriq_n_u8( vshlq_n_u8( v, 8-c ), v, c ); }

// reverse endian byte order
#define v128_bswap16(v)    u8_to_u16( vrev16q_u8( u16_to_u8(v) ))
#define v128_bswap32(v)    u8_to_u32( vrev32q_u8( u32_to_u8(v) ))
#define v128_bswap64(v)    u8_to_u64( vrev64q_u8( u64_to_u8(v) ))
#define v128_bswap128(v)   v128_swap64( v128_bswap64(v) )
static inline uint8x16_t v128_rol8( uint8x16_t v, int c )
{ return vsriq_n_u8( vshlq_n_u8( v, c ), v, 8-c ); }
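
// Illustrative usage sketch (not part of the original header): the shift-and-
// insert fallbacks above implement rotation by combining a shift by (width-c)
// with a shift-insert by c, e.g. the 64 bit rotate right by 32 common in hash
// round functions. The count must be a compile time constant; the function
// name is hypothetical.
static inline uint64x2_t ror64_by_32_example( uint64x2_t v )
{
   return v128_ror64( v, 32 );   // each 64 bit lane rotated right by 32 bits
}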

#define v128_block_bswap32( dst, src ) \
   casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
   casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
   casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
   casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
   casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
   casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
   casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
   casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );
/*
// Optimized for half element rotations (swap)
#define v128_ror64( v, c ) \
  ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_ror64_neon( v, c )

#define v128_block_bswap64( dst, src ) \
   dst[0] = v128_bswap64( src[0] ); \
   dst[1] = v128_bswap64( src[1] ); \
   dst[2] = v128_bswap64( src[2] ); \
   dst[3] = v128_bswap64( src[3] ); \
   dst[4] = v128_bswap64( src[4] ); \
   dst[5] = v128_bswap64( src[5] ); \
   dst[6] = v128_bswap64( src[6] ); \
   dst[7] = v128_bswap64( src[7] );
#define v128_rol64( v, c ) \
  ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_rol64_neon( v, c )

#define v128_ror32( v, c ) \
  ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_ror32_neon( v, c )

#define v128_rev32( v )    vrev64q_u32( v )
#define v128_rol32( v, c ) \
  ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_rol32_neon( v, c )
*/

#define v128_2ror64( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshrq_n_u64( v0, c ); \
   uint64x2_t t1 = vshrq_n_u64( v1, c ); \
   v0 = vsliq_n_u64( v0, 64-(c) ); \
   v1 = vsliq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2rol64_( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshlq_n_u64( v0, c ); \
   uint64x2_t t1 = vshlq_n_u64( v1, c ); \
   v0 = vsriq_n_u64( v0, 64-(c) ); \
   v1 = vsriq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2rorl32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshrq_n_u32( v0, c ); \
   uint32x4_t t1 = vshrq_n_u32( v1, c ); \
   v0 = vsliq_n_u32( v0, 32-(c) ); \
   v1 = vsliq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}

#define v128_2rorx32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshlq_n_u32( v0, c ); \
   uint32x4_t t1 = vshlq_n_u32( v1, c ); \
   v0 = vsriq_n_u32( v0, 32-(c) ); \
   v1 = vsriq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}

// vector rotation, size?
static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }

static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }

static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }

static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }

#define v128_swap64_32(v)    v128_ror64( v, 32 )
#define v128_shuflr64_24(v)  v128_ror64( v, 24 )
#define v128_shuflr64_16(v)  v128_ror64( v, 16 )
// Cross lane shuffles, no programmable shuffle in NEON

#define v128_swap32_16(v)    v128_ror32( v, 16 )
#define v128_shuflr32_8(v)   v128_ror32( v, 8 )
// vector mask, use as last resort. prefer rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
  v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \

// Not the same as SSE2, this uses vector mask, SSE2 uses imm8 mask.
#define v128_blend16( v1, v0, mask ) \
   v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )
// compatible with x86_64, but very slow, avoid
#define v128_shuffle8( v, vmask ) \
  v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
             ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )

#endif
#define v128_swap64_32( v )        vrev64q_u32( v )
#define v128_shuflr64_16( v )      v128_ror64( v, 16 )
#define v128_shufll64_16( v )      v128_rol64( v, 16 )

// Don't use as an alias for byte sized bit rotation
#define v128_swap32_16( v )        vrev64q_u16( v )
#define v128_shuflr32_8( v )       v128_ror32( v, 8 )
#define v128_shufll32_8( v )       v128_rol32( v, 8 )

// reverse elements
#define v128_rev32( v )    vrev64q_u32( v )
#define v128_rev16( v )    vrev64q_u16( v )
#define v128_rev8( v )     vrev64q_u8( v )

// reverse bits, nothing like it in x86_64
#define v128_bitrev8( v )  vrbitq_u8( v )

// reverse byte order
#define v128_bswap16       vrev16q_u8
#define v128_bswap32       vrev32q_u8
#define v128_bswap64       vrev64q_u8
#define v128_bswap128(v)   v128_swap64( v128_bswap64(v) )
#define v128_bswap256(p)   v128_bswap128( (p)[0], (p)[1] )

// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
   casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
   casti_v128u32( dst,1 ) = v128_bswap32( casti_v128u32( src,1 ) ); \
   casti_v128u32( dst,2 ) = v128_bswap32( casti_v128u32( src,2 ) ); \
   casti_v128u32( dst,3 ) = v128_bswap32( casti_v128u32( src,3 ) ); \
   casti_v128u32( dst,4 ) = v128_bswap32( casti_v128u32( src,4 ) ); \
   casti_v128u32( dst,5 ) = v128_bswap32( casti_v128u32( src,5 ) ); \
   casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
   casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
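
// Illustrative usage sketch (not part of the original header):
// v128_block_bswap32 byte swaps 8 consecutive 128 bit vectors (32 u32 words)
// and dst may alias src. Assumes buf is 16 byte aligned; the function name
// and buffer are hypothetical.
static inline void block_bswap32_example( uint32_t buf[32] )
{
   v128_block_bswap32( buf, buf );   // convert the whole block in place
}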
#define v128_block_bswap32_256( dst, src ) \

#define v128_block_bswap32_512( dst, src ) \
{ \
   casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
   casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
   casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
   casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
   casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
   casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
   casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
   casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
   casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
   casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
   casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
   casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
   casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
   casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
   casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
   casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}

#define v128_block_bswap64( dst, src ) \
{ \
   casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
   casti_v128u64( dst,1 ) = v128_bswap64( casti_v128u64( src,1 ) ); \
   casti_v128u64( dst,2 ) = v128_bswap64( casti_v128u64( src,2 ) ); \
   casti_v128u64( dst,3 ) = v128_bswap64( casti_v128u64( src,3 ) ); \
   casti_v128u64( dst,4 ) = v128_bswap64( casti_v128u64( src,4 ) ); \
   casti_v128u64( dst,5 ) = v128_bswap64( casti_v128u64( src,5 ) ); \
   casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
   casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
#define v128_block_bswap64_512 v128_block_bswap64

#define v128_block_bswap64_1024( dst, src ) \
{ \
   casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
   casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
   casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
   casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
   casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
   casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
   casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
   casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
   casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
   casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
   casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
   casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
   casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
   casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
   casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}

// Programmable shuffles
// no compatible shuffles with x86_64, will require targeted user code.

#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
                           d7, d6, d5, d4, d3, d2, d1, d0, vmask ) \
  d0 = ((uint8_t*)(&vmask))[ 0]; d1 = ((uint8_t*)(&vmask))[ 1]; \
  d2 = ((uint8_t*)(&vmask))[ 2]; d3 = ((uint8_t*)(&vmask))[ 3]; \
  d4 = ((uint8_t*)(&vmask))[ 4]; d5 = ((uint8_t*)(&vmask))[ 5]; \
  d6 = ((uint8_t*)(&vmask))[ 6]; d7 = ((uint8_t*)(&vmask))[ 7]; \
  d8 = ((uint8_t*)(&vmask))[ 8]; d9 = ((uint8_t*)(&vmask))[ 9]; \
  da = ((uint8_t*)(&vmask))[10]; db = ((uint8_t*)(&vmask))[11]; \
  dc = ((uint8_t*)(&vmask))[12]; dd = ((uint8_t*)(&vmask))[13]; \
  de = ((uint8_t*)(&vmask))[14]; df = ((uint8_t*)(&vmask))[15];

// Blendv
#define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
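
// Illustrative usage sketch (not part of the original header): v128_blendv
// selects v0 where the mask bits are set and v1 where they are clear, similar
// in spirit to SSE4.1 blendv but with a full width vector mask. The function
// name is hypothetical.
static inline uint32x4_t blendv_example( uint32x4_t a, uint32x4_t b,
                                         uint32x4_t mask )
{
   return v128_blendv( a, b, mask );   // mask bit 1 selects b, 0 selects a
}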

/*
// vbcaxq not defined
#define v128_blendv( v1, v0, mask ) \
   vbcaxq_u32( v128_and( mask, v1 ), v0, mask )
*/

#endif   // __ARM_NEON

#endif   // SIMD_NEON_H__