mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00
v23.10
@@ -35,17 +35,17 @@
///////////////////////////////////////////////////////////////////////////////

// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use new syntax or protect SSE2 only
// code segments.
// Other vector sizes continue with old syntax for now.
// Definitions here will gradually be converted to the new syntax.
// For consistency the larger vector utilities should do the same.

//
// __m128i -> v128_t
// _mm_    -> v128_
// mm128_  -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
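// For example, a fragment written with the old names,
//
//    __m128i t = _mm_xor_si128( _mm_andnot_si128( a, b ), a );
//
// becomes, in the new architecturally agnostic syntax,
//
//    v128_t t = v128_xor( v128_andnot( a, b ), a );
//
// (Illustrative sketch only; the variable names are not part of this file.)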

// direct translation of native intrinsics

#define v128_t __m128i

// Needed for ARM
#define v128u64_t v128_t
#define v128u32_t v128_t
#define v128u16_t v128_t

@@ -56,9 +56,9 @@

// Needed for ARM; doesn't do anything special on x86_64.
#define v128_load1_64(p) _mm_set1_epi64x(*(uint64_t*)(p) )
#define v128_load1_32(p) _mm_set_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set_epi8( *(uint8_t*) (p) )
#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) )
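// Example: a sketch (not upstream code) broadcasting one 32-bit word from
// memory to all four lanes, e.g. to splat a nonce across a vector. Only
// v128_load1_32 as defined above is used; the names are illustrative.

static inline v128_t example_splat_nonce( const uint32_t *nonce_ptr )
{  return v128_load1_32( nonce_ptr );  }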

// arithmetic
#define v128_add64 _mm_add_epi64

@@ -80,7 +80,7 @@
#define v128_mulw32 _mm_mul_epu32
#define v128_mulw16 _mm_mul_epu16

// compare
// signed compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16

@@ -118,20 +118,6 @@
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )

// ternary
#define v128_xorandnot( v2, v1, v0 ) \
   _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) \
   _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
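// Example: a sketch (not upstream code) of the SHA-2 "ch" choice function,
// ch(x,y,z) = (x & y) ^ (~x & z), built from the ternary helpers above.
// The function name is illustrative only.

static inline v128_t example_ch( v128_t x, v128_t y, v128_t z )
{  return v128_xorandnot( _mm_and_si128( x, y ), x, z );  }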

// unpack
#define v128_unpacklo64 _mm_unpacklo_epi64

@@ -234,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n )

// Pseudo constants
#define v128_zero _mm_setzero_si128()
#define m128_zero _mm_setzero_si128()

#if defined(__SSE4_1__)

// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0 _mm_testz_si128
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)

// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }

#endif

// Bitwise compare, return 1 if all bits set.
#define v128_cmpeq1 _mm_test_all_ones
#define v128_cmpeq1(v) _mm_test_all_ones(v)

#define v128_one mm128_mov64_128( 1 )
#define m128_one_128 v128_one
#define v128_one mm128_mov64_128(1)

// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.

@@ -265,17 +249,14 @@ static inline __m128i v128_neg1_fn()
#endif
   return a;
}
#define m128_neg1_fn v128_neg1_fn
#define v128_neg1 v128_neg1_fn()
#define m128_neg1 v128_neg1

//
// Vector pointer cast

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
#define castp_v128 castp_m128i
#define castp_v128(p) ((__m128i*)(p))
#define castp_v128u64 castp_v128
#define castp_v128u32 castp_v128
#define castp_v128u16 castp_v128

@@ -283,8 +264,7 @@ static inline __m128i v128_neg1_fn()

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
#define cast_v128(p) (*((__m128i*)(p)))
#define cast_v128u64 cast_v128
#define cast_v128u32 cast_v128
#define cast_v128u16 cast_v128

@@ -292,8 +272,8 @@ static inline __m128i v128_neg1_fn()

// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128

@@ -301,7 +281,7 @@ static inline __m128i v128_neg1_fn()

// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
#define casto_v128(p,o) (((__m128i*)(p))+(o))

#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )

@@ -316,7 +296,7 @@ static inline __m128i v128_neg1_fn()

/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//
// Fast and powerful but very limited in its application.
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit

@@ -371,37 +351,31 @@ static inline __m128i v128_neg1_fn()

#if defined(__AVX512VL__)
//TODO Enable for AVX10_256

static inline __m128i mm128_not( const __m128i v )
static inline __m128i v128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }

#else

#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#define v128_not( v ) _mm_xor_si128( v, v128_neg1 )

#endif
#define v128_not mm128_not

static inline __m128i mm128_negate_64( __m128i v )
static inline v128u64_t v128_negate_64( v128u64_t v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64

static inline __m128i mm128_negate_32( __m128i v )
static inline v128u32_t v128_negate_32( v128u32_t v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32

static inline __m128i mm128_negate_16( __m128i v )
static inline v128u16_t v128_negate_16( v128u16_t v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
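// Note: _mm_xor_si128( v, v ) produces an all-zero vector, so each negate
// above computes 0 - v, i.e. the two's-complement negation of every element.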

// Add 4 values, fewer dependencies than sequential addition.
#define v128_add4_64( a, b, c, d ) \
   _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
#define mm128_add4_64 v128_add4_64

#define v128_add4_32( a, b, c, d ) \
   _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define mm128_add4_32 v128_add4_32

#define v128_add4_16( a, b, c, d ) \
   _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )

@@ -411,7 +385,6 @@ static inline __m128i mm128_negate_16( __m128i v )

#define v128_xor4( a, b, c, d ) \
   _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
#define mm128_xor4 v128_xor4

// Memory functions

@@ -419,70 +392,71 @@ static inline __m128i mm128_negate_16( __m128i v )

// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16

static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void v128_memset_zero( v128_t *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
#define memset_zero_128 v128_memset_zero

static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128

static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
#define memcpy_128 v128_memcpy
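// Example: a sketch (not upstream code) of typical use, loading an initial
// value into a 64-byte hash state and clearing a work buffer. The buffer
// sizes and names are illustrative.

static inline void example_init( v128_t *state, v128_t *buf, const v128_t *iv )
{
   v128_memcpy( state, iv, 4 );    // 4 vectors = 64 bytes
   v128_memset_zero( buf, 8 );     // clear a 128 byte work buffer
}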

#if defined(__AVX512VL__)
//TODO Enable for AVX10_256

// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )

// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )

// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )

// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )

// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )

// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )

// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )

// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )

// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )

#else

#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )

#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )

#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )

#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )

#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))

#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )

#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )

#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )

#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )

#endif
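// Note on the immediates above: _mm_ternarylogic_epi64 evaluates an arbitrary
// three-input boolean function; the 8-bit immediate is the truth table, with
// bit i giving the result for input combination i (a = bit 2, b = bit 1,
// c = bit 0). For example a ^ b ^ c is true when an odd number of inputs are
// set, i.e. rows 1, 2, 4 and 7, which is binary 1001'0110 = 0x96.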

#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )

// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.

@@ -508,7 +482,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )

// These sgould never be callled from application code, use rol/ror.
// These should never be called from application code, use rol/ror.
#define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

@@ -524,12 +498,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#if defined(__AVX512VL__)

// AVX512 fastest all rotations.
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32

// ror/rol will alway find the fastest but these names may fit better with
// ror/rol will always find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )

@@ -543,7 +517,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )

#elif defined(__SSSE3__)
// SSE2: fastest 32 bit, very fast 16, fast 8
// SSSE3: fastest 32 bit, very fast 16, fast 8

#define v128_shuflr64_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \

@@ -569,7 +543,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0f0a09080b, 0x0605040702010003 ) )

#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
   ( (c) == 8 ) ? v128_shuflr64_8( v ) \
 : ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
 : ( (c) == 24 ) ? v128_shuflr64_24( v ) \

@@ -579,7 +553,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 : ( (c) == 56 ) ? v128_shufll64_8( v ) \
 : v128_ror64_sse2( v, c )
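// Example: a sketch (not upstream code) showing why the compile-time count
// matters. With a count of 16 the macro above resolves to a single 16-bit
// lane shuffle; any other count falls back to the shift-and-or sequence in
// v128_ror64_sse2. The function name is illustrative.

static inline v128_t example_ror16( v128_t v )
{  return v128_ror64( v, 16 );  }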

#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
   ( (c) == 8 ) ? v128_shufll64_8( v ) \
 : ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
 : ( (c) == 24 ) ? v128_shufll64_24( v ) \

@@ -589,13 +563,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 : ( (c) == 56 ) ? v128_shuflr64_8( v ) \
 : v128_rol64_sse2( v, c )

#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
   ( (c) == 8 ) ? v128_shuflr32_8( v ) \
 : ( (c) == 16 ) ? v128_lrev16( v ) \
 : ( (c) == 24 ) ? v128_shufll32_8( v ) \
 : v128_ror32_sse2( v, c )

#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
   ( (c) == 8 ) ? v128_shufll32_8( v ) \
 : ( (c) == 16 ) ? v128_lrev16( v ) \
 : ( (c) == 24 ) ? v128_shuflr32_8( v ) \

@@ -604,42 +578,41 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16

#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
 : v128_ror64_sse2( v, c )

#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
 : v128_rol64_sse2( v, c )

#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
 : v128_ror32_sse2( v, c )

#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
 : v128_rol32_sse2( v, c )

#else

#define mm128_ror_64 v128_ror64_sse2
#define mm128_rol_64 v128_rol64_sse2
#define mm128_ror_32 v128_ror32_sse2
#define mm128_rol_32 v128_rol32_sse2
#define v128_ror64 v128_ror64_sse2
#define v128_rol64 v128_rol64_sse2
#define v128_ror32 v128_ror32_sse2
#define v128_rol32 v128_rol32_sse2

#endif

// Generic names for portable code
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32

//#define v128_ror64 mm128_ror_64
//#define v128_rol64 mm128_rol_64
//#define v128_ror32 mm128_ror_32
#define mm128_rol_32 v128_rol32

/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.

@@ -647,25 +620,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#if defined(__AVX512VL__)
//TODO Enable for AVX10_256

#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
   _mm_ror_epi64( v0, c ); \
   _mm_ror_epi64( v1, c )

#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
   _mm_rol_epi64( v0, c ); \
   _mm_rol_epi64( v1, c )

#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
   _mm_ror_epi32( v0, c ); \
   _mm_ror_epi32( v1, c )

#define mm128_rolx2_32( v1, v0, c ) \
#define mm128_2rol32( v1, v0, c ) \
   _mm_rol_epi32( v0, c ); \
   _mm_rol_epi32( v1, c )

#else // SSE2

#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi64( v0, c ); \
   __m128i t1 = _mm_srli_epi64( v1, c ); \

@@ -675,7 +648,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi64( v0, c ); \
   __m128i t1 = _mm_slli_epi64( v1, c ); \

@@ -685,7 +658,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi32( v0, c ); \
   __m128i t1 = _mm_srli_epi32( v1, c ); \

@@ -695,7 +668,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   v1 = _mm_or_si128( v1, t1 ); \
}

#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi32( v0, c ); \
   __m128i t1 = _mm_slli_epi32( v1, c ); \

@@ -706,12 +679,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
}

#endif // AVX512 else SSE2

#define v128_2ror64 mm128_rorx2_64
#define v128_2rol64 mm128_rolx2_64
#define v128_2ror32 mm128_rorx2_32
#define v128_2rol32 mm128_rolx2_32

*/

// Cross lane shuffles

@@ -750,95 +718,76 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )

//TODO fix this
// alias bswap
//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )

// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8

/* Not used
#if defined(__SSSE3__)

// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }

#endif
*/

// Endian byte swap.

#if defined(__SSSE3__)

#define mm128_bswap_128( v ) \
#define v128_bswap128( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
                                        0x08090a0b0c0d0e0f ) )

#define mm128_bswap_64( v ) \
#define v128_bswap64( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
                                        0x0001020304050607 ) )

#define mm128_bswap_32( v ) \
#define v128_bswap32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                        0x0405060700010203 ) )
#define mm128_bswap_32 v128_bswap32

#define mm128_bswap_16( v ) \
#define v128_bswap16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
                                        0x0607040502030001 ) )

// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
   casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
   casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
   casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
   casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
   casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
   casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
   casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
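// Example: a sketch (not upstream code) of the typical use for the block
// byte swap, converting a 128 byte big-endian message block to host order
// before hashing. The buffer and function names are illustrative.

static inline void example_load_block( v128_t *block_le, const v128_t *block_be )
{
   mm128_block_bswap_64( block_le, block_be );   // 8 x 16-byte vectors = 128 bytes
}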

#define v128_block_bswap64_1024( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
   casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
   casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
   casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
   casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
   casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
   casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
   casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
   casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
   casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
   casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
   casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
   casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
   casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
   casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
   casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}

// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
   casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
   casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
   casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
   casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
   casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
   casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
   casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32

@@ -846,129 +795,127 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

#define mm128_block_bswap32_128( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
   casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
   casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
   casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}

#define v128_block_bswap32_512( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
   casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
   casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
   casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
   casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
   casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
   casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
   casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
   casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
   casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
   casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
   casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
   casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
   casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
   casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
   casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}

#else // SSE2

static inline __m128i mm128_bswap_64( __m128i v )
static inline v128_t v128_bswap64( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}

static inline __m128i mm128_bswap_32( __m128i v )
static inline v128_t v128_bswap32( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
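// Note: without SSSE3's byte shuffle the swap is done in two steps: the
// shift-and-or swaps the two bytes inside every 16-bit lane, then the
// 16-bit shuffles reverse the lane order within each element.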
#define mm128_bswap_32 v128_bswap32

static inline __m128i mm128_bswap_16( __m128i v )
static inline v128_t v128_bswap16( __m128i v )
{
   return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}

#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )

static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
   d[0] = mm128_bswap_64( s[0] );
   d[1] = mm128_bswap_64( s[1] );
   d[2] = mm128_bswap_64( s[2] );
   d[3] = mm128_bswap_64( s[3] );
   d[4] = mm128_bswap_64( s[4] );
   d[5] = mm128_bswap_64( s[5] );
   d[6] = mm128_bswap_64( s[6] );
   d[7] = mm128_bswap_64( s[7] );
   d[0] = v128_bswap64( s[0] );
   d[1] = v128_bswap64( s[1] );
   d[2] = v128_bswap64( s[2] );
   d[3] = v128_bswap64( s[3] );
   d[4] = v128_bswap64( s[4] );
   d[5] = v128_bswap64( s[5] );
   d[6] = v128_bswap64( s[6] );
   d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64

static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
   d[ 0] = mm128_bswap_64( s[ 0] );
   d[ 1] = mm128_bswap_64( s[ 1] );
   d[ 2] = mm128_bswap_64( s[ 2] );
   d[ 3] = mm128_bswap_64( s[ 3] );
   d[ 4] = mm128_bswap_64( s[ 4] );
   d[ 5] = mm128_bswap_64( s[ 5] );
   d[ 6] = mm128_bswap_64( s[ 6] );
   d[ 7] = mm128_bswap_64( s[ 7] );
   d[ 8] = mm128_bswap_64( s[ 8] );
   d[ 9] = mm128_bswap_64( s[ 9] );
   d[10] = mm128_bswap_64( s[10] );
   d[11] = mm128_bswap_64( s[11] );
   d[12] = mm128_bswap_64( s[12] );
   d[13] = mm128_bswap_64( s[13] );
   d[14] = mm128_bswap_64( s[14] );
   d[15] = mm128_bswap_64( s[15] );
   d[ 0] = v128_bswap64( s[ 0] );
   d[ 1] = v128_bswap64( s[ 1] );
   d[ 2] = v128_bswap64( s[ 2] );
   d[ 3] = v128_bswap64( s[ 3] );
   d[ 4] = v128_bswap64( s[ 4] );
   d[ 5] = v128_bswap64( s[ 5] );
   d[ 6] = v128_bswap64( s[ 6] );
   d[ 7] = v128_bswap64( s[ 7] );
   d[ 8] = v128_bswap64( s[ 8] );
   d[ 9] = v128_bswap64( s[ 9] );
   d[10] = v128_bswap64( s[10] );
   d[11] = v128_bswap64( s[11] );
   d[12] = v128_bswap64( s[12] );
   d[13] = v128_bswap64( s[13] );
   d[14] = v128_bswap64( s[14] );
   d[15] = v128_bswap64( s[15] );
}

static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
{
   d[0] = mm128_bswap_32( s[0] );
   d[1] = mm128_bswap_32( s[1] );
   d[2] = mm128_bswap_32( s[2] );
   d[3] = mm128_bswap_32( s[3] );
   d[4] = mm128_bswap_32( s[4] );
   d[5] = mm128_bswap_32( s[5] );
   d[6] = mm128_bswap_32( s[6] );
   d[7] = mm128_bswap_32( s[7] );
   d[0] = v128_bswap32( s[0] );
   d[1] = v128_bswap32( s[1] );
   d[2] = v128_bswap32( s[2] );
   d[3] = v128_bswap32( s[3] );
   d[4] = v128_bswap32( s[4] );
   d[5] = v128_bswap32( s[5] );
   d[6] = v128_bswap32( s[6] );
   d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32

static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
{
   d[ 0] = mm128_bswap_32( s[ 0] );
   d[ 1] = mm128_bswap_32( s[ 1] );
   d[ 2] = mm128_bswap_32( s[ 2] );
   d[ 3] = mm128_bswap_32( s[ 3] );
   d[ 4] = mm128_bswap_32( s[ 4] );
   d[ 5] = mm128_bswap_32( s[ 5] );
   d[ 6] = mm128_bswap_32( s[ 6] );
   d[ 7] = mm128_bswap_32( s[ 7] );
   d[ 8] = mm128_bswap_32( s[ 8] );
   d[ 9] = mm128_bswap_32( s[ 9] );
   d[10] = mm128_bswap_32( s[10] );
   d[11] = mm128_bswap_32( s[11] );
   d[12] = mm128_bswap_32( s[12] );
   d[13] = mm128_bswap_32( s[13] );
   d[14] = mm128_bswap_32( s[14] );
   d[15] = mm128_bswap_32( s[15] );
   d[ 0] = v128_bswap32( s[ 0] );
   d[ 1] = v128_bswap32( s[ 1] );
   d[ 2] = v128_bswap32( s[ 2] );
   d[ 3] = v128_bswap32( s[ 3] );
   d[ 4] = v128_bswap32( s[ 4] );
   d[ 5] = v128_bswap32( s[ 5] );
   d[ 6] = v128_bswap32( s[ 6] );
   d[ 7] = v128_bswap32( s[ 7] );
   d[ 8] = v128_bswap32( s[ 8] );
   d[ 9] = v128_bswap32( s[ 9] );
   d[10] = v128_bswap32( s[10] );
   d[11] = v128_bswap32( s[11] );
   d[12] = v128_bswap32( s[12] );
   d[13] = v128_bswap32( s[13] );
   d[14] = v128_bswap32( s[14] );
   d[15] = v128_bswap32( s[15] );
}

#endif // SSSE3 else SSE2

#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64

@@ -991,16 +938,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
   _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )

#endif
#define mm128_alignr_64 v128_alignr64
#define mm128_alignr_32 v128_alignr32
#define mm128_alignr_8 v128_alignr32

// NEON only uses vector mask. x86 blend selects second arg when control bit
// is set. Blendv selects second arg when sign bit is set. And masking is the
// opposite, elements are selected from the first arg if the mask bits are set.
// Arm blend is a bit by bit blend while x86 is an element blend.
// Reverse the logic so the use mask is consistent with both formats.

#if defined(__SSE4_1__)

#define v128_blendv _mm_blendv_epi8

@@ -1008,7 +946,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )

#else

#define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

#endif