mirror of https://github.com/JayDDee/cpuminer-opt.git
v23.8
@@ -153,10 +153,16 @@
#define v128_unpackhi8 _mm_unpackhi_epi8

// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdec_nokey(v) _mm_aesdec_si128( v, v128_zero )
#define v128_aesdeclast _mm_aesdeclast_si128
#define v128_aesdeclast_nokey(v) _mm_aesdeclast_si128( v, v128_zero )
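For context, a minimal sketch (not part of the header; assumes AES-NI and SSE2, with _mm_setzero_si128() standing in for v128_zero) of what the _nokey variants amount to on x86_64, where a zero round key makes the keyed and keyless forms interchangeable:

// Illustration only: keyed round versus keyless round plus an explicit XOR.
#include <wmmintrin.h>   // AES-NI
#include <emmintrin.h>   // SSE2

static inline __m128i round_keyed( __m128i s, __m128i k )
{  return _mm_aesenc_si128( s, k ); }                       // key folded into the round

static inline __m128i round_nokey_plus_xor( __m128i s, __m128i k )
{  return _mm_xor_si128( _mm_aesenc_si128( s, _mm_setzero_si128() ), k ); }

On ARM the second form is where the saving comes from, since the NEON round already starts from a zero key (see the NEON definitions later in this commit).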

// Used instead of casting.
typedef union
@@ -499,73 +505,141 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations

// Slow bit rotation, used as last resort
#define mm128_ror_64_sse2( v, c ) \
#define v128_shuffle16( v, c ) \
   _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )

#define v128_qrev32(v) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32(v) _mm_shuffle_epi32( v, 0xb1 ) // grandfathered

#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )

// These should never be called from application code, use rol/ror.
#define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

#define mm128_rol_64_sse2( v, c ) \
#define v128_rol64_sse2( v, c ) \
   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )

#define mm128_ror_32_sse2( v, c ) \
#define v128_ror32_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

#define mm128_rol_32_sse2( v, c ) \
#define v128_rol32_sse2( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
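As a reference point, a scalar model (illustration only, assuming 0 < c < width, since a full-width shift is undefined in C) of what the shift-and-OR fallbacks compute in each lane:

#include <stdint.h>

static inline uint64_t ror64_model( uint64_t x, unsigned c )   // v128_ror64_sse2, per lane
{  return ( x >> c ) | ( x << ( 64 - c ) ); }

static inline uint32_t ror32_model( uint32_t x, unsigned c )   // v128_ror32_sse2, per lane
{  return ( x >> c ) | ( x << ( 32 - c ) ); }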

#if defined(__AVX512VL__)

#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
// AVX512 fastest all rotations.
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32

// ror/rol will always find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
#define v128_shufll64_16(v) _mm_rol_epi64( v, 16 )
#define v128_shuflr64_24(v) _mm_ror_epi64( v, 24 )
#define v128_shufll64_24(v) _mm_rol_epi64( v, 24 )
#define v128_shuflr32_8( v) _mm_ror_epi32( v, 8 )
#define v128_shufll32_8( v) _mm_rol_epi32( v, 8 )
#define v128_shuflr32_16(v) _mm_ror_epi32( v, 16 )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )

// optimized byte wise rotation
#elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8

#define v128_shuflr64_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x080f0e0d0c0b0a09, 0x0007060504030201 ) )

#define v128_shufll64_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0b0a09080f, 0x0605040302010007 ) )

#define v128_shuflr64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0a09080f0e0d0c0b, 0x0201000706050403 ) )

#define v128_shufll64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0b0a09080f0e0d, 0x0403020100070605 ) )

#define v128_shuflr32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0f0e0d080b0a09, 0x0407060500030201 ) )

#define v128_shufll32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
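A quick self-check sketch (not part of the header; assumes SSSE3 and a C99 compiler) that the v128_shuflr64_8 control vector matches a plain rotate right by 8 in each 64-bit lane:

#include <tmmintrin.h>   // SSSE3
#include <stdio.h>

int main(void)
{
   __m128i v   = _mm_set_epi64x( 0x0123456789abcdef, 0x1122334455667788 );
   __m128i ctl = _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 );
   __m128i a   = _mm_shuffle_epi8( v, ctl );                     // byte-shuffle rotation
   __m128i b   = _mm_or_si128( _mm_srli_epi64( v, 8 ),
                               _mm_slli_epi64( v, 56 ) );        // shift-and-OR rotation
   printf( "%s\n", _mm_movemask_epi8( _mm_cmpeq_epi8( a, b ) ) == 0xffff
                   ? "match" : "differ" );
   return 0;
}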

#define mm128_ror_64( v, c ) \
   ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) \
   : ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) \
   : ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) \
   : mm128_ror_64_sse2( v, c )
   ( (c) == 8 ) ? v128_shuflr64_8( v ) \
   : ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
   : ( (c) == 24 ) ? v128_shuflr64_24( v ) \
   : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 40 ) ? v128_shufll64_24( v ) \
   : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
   : ( (c) == 56 ) ? v128_shufll64_8( v ) \
   : v128_ror64_sse2( v, c )

#define mm128_rol_64( v, c ) \
   ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) \
   : ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) \
   : ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) \
   : mm128_rol_64_sse2( v, c )
   ( (c) == 8 ) ? v128_shufll64_8( v ) \
   : ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
   : ( (c) == 24 ) ? v128_shufll64_24( v ) \
   : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 40 ) ? v128_shuflr64_24( v ) \
   : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
   : ( (c) == 56 ) ? v128_shuflr64_8( v ) \
   : v128_rol64_sse2( v, c )

#define mm128_ror_32( v, c ) \
   ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
   : ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) \
   : mm128_ror_32_sse2( v, c )
   ( (c) == 8 ) ? v128_shuflr32_8( v ) \
   : ( (c) == 16 ) ? v128_lrev16( v ) \
   : ( (c) == 24 ) ? v128_shufll32_8( v ) \
   : v128_ror32_sse2( v, c )

#define mm128_rol_32( v, c ) \
   ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
   : ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) \
   : mm128_rol_32_sse2( v, c )
   ( (c) == 8 ) ? v128_shufll32_8( v ) \
   : ( (c) == 16 ) ? v128_lrev16( v ) \
   : ( (c) == 24 ) ? v128_shuflr32_8( v ) \
   : v128_rol32_sse2( v, c )

#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16

#define mm128_ror_64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
   : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
   : v128_ror64_sse2( v, c )

#define mm128_rol_64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
   : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
   : v128_rol64_sse2( v, c )

#define mm128_ror_32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
   : v128_ror32_sse2( v, c )

#define mm128_rol_32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
   : v128_rol32_sse2( v, c )
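How the 16-bit shuffle immediates used above decode (a worked example, not from the source): each 2-bit field selects the source word for one destination word, and shufflelo/shufflehi apply the same pattern to the low and high 64-bit halves.

// Hypothetical helper for building the immediate; s3..s0 are source word indices.
#define SHUF16_IMM( s3, s2, s1, s0 )   ( ((s3)<<6) | ((s2)<<4) | ((s1)<<2) | (s0) )
// SHUF16_IMM( 0, 3, 2, 1 ) == 0x39: d0=s1 d1=s2 d2=s3 d3=s0, a 16-bit ror of each 64-bit lane.
// SHUF16_IMM( 2, 1, 0, 3 ) == 0x93: d0=s3 d1=s0 d2=s1 d3=s2, a 16-bit rol of each 64-bit lane.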

#else

#define mm128_ror_64 mm128_ror_64_sse2
#define mm128_rol_64 mm128_rol_64_sse2
#define mm128_ror_32 mm128_ror_32_sse2
#define mm128_rol_32 mm128_rol_32_sse2
#define mm128_ror_64 v128_ror64_sse2
#define mm128_rol_64 v128_rol64_sse2
#define mm128_ror_32 v128_ror32_sse2
#define mm128_rol_32 v128_rol32_sse2

#endif

// Architecturally agnostic naming
// Generic names for portable code
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
@@ -669,9 +743,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

// Rotate vector elements across all lanes

#define v128_shuffle16( v, c ) \
   _mm_or_si128( _mm_shufflehi_epi16( v, c ), _mm_shufflelo_epi16( v, c ) )

// reverse elements in vector
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
@@ -685,24 +756,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )

// Some sub-vector shuffles are identical to bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage of these versions
// is context sensitive.

// reverse elements in vector lanes
#define v128_qrev32(v) v128_ror64( v, 32 )
#define v128_swap64_32(v) v128_ror64( v, 32 ) // grandfathered

#define v128_qrev16(v) \
   _mm_or_si128( _mm_shufflehi_epi16( v, v128u16( 0x1b ) ), \
                 _mm_shufflelo_epi16( v, v128u16( 0x1b ) ) )

#define v128_lrev16(v) v128_ror32( v, 16 )

//TODO fix this
// alias bswap
#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )

// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8
@@ -790,6 +849,16 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32


#define mm128_block_bswap32_128( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
   casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
   casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
   casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
}
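A hypothetical usage sketch (not from the source; assumes SSSE3 and 16-byte aligned buffers, with plain loads standing in for casti_m128i) of the same control vector applied to four 128-bit words, e.g. when loading big-endian 32-bit message words:

#include <tmmintrin.h>
#include <stdint.h>

static void bswap32_block64( uint32_t dst[16], const uint32_t src[16] )
{
   const __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
   for ( int i = 0; i < 4; i++ )                               // 4 x 16 bytes
      _mm_store_si128( (__m128i*)dst + i,
         _mm_shuffle_epi8( _mm_load_si128( (const __m128i*)src + i ), ctl ) );
}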

#define v128_block_bswap32_512( d, s ) \
{ \
   __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
@@ -218,7 +218,29 @@ static inline __m256i mm256_not( const __m256i v )
//
// Bit rotations.

// Slow version, used as last resort
#define mm256_shuffle16( v, c ) \
   _mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )

#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered

#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )

#define mm256_qrev8(v) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
                        v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )

#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )

#define mm256_lrev8(v) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
                        v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )

#define mm256_wrev8(v) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
                        v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )

// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                    _mm256_slli_epi64( v, 64-(c) ) )
@@ -242,40 +264,76 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32

// Redundant but naming may be a better fit in some applications.
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )

#else

// ROR & ROL will always find the fastest but these names may be a better fit
// in some applications.
#define mm256_shuflr64_8( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )

#define mm256_shufll64_8( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )

#define mm256_shuflr64_24( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )

#define mm256_shufll64_24( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )

#define mm256_shuflr32_8( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )

#define mm256_shufll32_8( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
      _mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
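The 16-byte control is broadcast to both halves because _mm256_shuffle_epi8 shuffles within each 128-bit lane. A sketch (illustration only, assuming mm256_bcast_m128 behaves like _mm256_broadcastsi128_si256):

#include <immintrin.h>   // AVX2

static inline __m256i shuflr64_8_sketch( __m256i v )
{
   const __m128i ctl128 = _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 );
   const __m256i ctl    = _mm256_broadcastsi128_si256( ctl128 );  // same control in both lanes
   return _mm256_shuffle_epi8( v, ctl );            // per-lane ror 8 of the 64-bit elements
}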

#define mm256_ror_64( v, c ) \
   ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) \
   : ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) ) \
   : ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) \
   ( (c) == 8 ) ? mm256_shuflr64_8( v ) \
   : ( (c) == 16 ) ? mm256_shuffle16( v, 0x39 ) \
   : ( (c) == 24 ) ? mm256_shuflr64_24( v ) \
   : ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 40 ) ? mm256_shufll64_24( v ) \
   : ( (c) == 48 ) ? mm256_shuffle16( v, 0x93 ) \
   : ( (c) == 56 ) ? mm256_shufll64_8( v ) \
   : mm256_ror_64_avx2( v, c )

#define mm256_rol_64( v, c ) \
   ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) \
   : ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) ) \
   : ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) \
   ( (c) == 8 ) ? mm256_shufll64_8( v ) \
   : ( (c) == 16 ) ? mm256_shuffle16( v, 0x93 ) \
   : ( (c) == 24 ) ? mm256_shufll64_24( v ) \
   : ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
   : ( (c) == 40 ) ? mm256_shuflr64_24( v ) \
   : ( (c) == 48 ) ? mm256_shuffle16( v, 0x39 ) \
   : ( (c) == 56 ) ? mm256_shuflr64_8( v ) \
   : mm256_rol_64_avx2( v, c )

#define mm256_ror_32( v, c ) \
   ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
   : ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) \
   ( (c) == 8 ) ? mm256_shuflr32_8( v ) \
   : ( (c) == 16 ) ? mm256_lrev16( v ) \
   : ( (c) == 24 ) ? mm256_shufll32_8( v ) \
   : mm256_ror_32_avx2( v, c )

#define mm256_rol_32( v, c ) \
   ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
   : ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
         _mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) \
   ( (c) == 8 ) ? mm256_shufll32_8( v ) \
   : ( (c) == 16 ) ? mm256_lrev16( v ) \
   : ( (c) == 24 ) ? mm256_shuflr32_8( v ) \
   : mm256_rol_32_avx2( v, c )

#endif
@@ -400,25 +458,19 @@ static inline __m256i mm256_not( const __m256i v )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)

static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }

static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); }

#else

#define mm256_shuflr_32( v ) \
   _mm256_permutevar8x32_epi32( v, \
       _mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
                          0x0000000400000003, 0x0000000200000001 ) )

#define mm256_shufll_32( v ) \
   _mm256_permutevar8x32_epi32( v, \
       _mm256_set_epi64x( 0x0000000600000005, 0x0000000400000003, \
                          0x0000000200000001, 0x0000000000000007 ) )

#endif
*/

@@ -450,21 +502,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/

// Same as bit rotation but logically used as byte/word rotation.
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 ) // grandfathered
#define mm256_rev64_32( v ) mm256_ror_64( v, 32 )

#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )

#define mm256_shuflr64_8(v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8(v) _mm256_rol_epi64( v, 8 )

#define mm256_rev32_16( v ) mm256_ror_32( v, 16 )

#define mm256_shuflr32_8(v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8(v) _mm256_rol_epi32( v, 8 )

// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
@@ -1,6 +1,9 @@
#if !defined(SIMD_INT_H__)
#define SIMD_INT_H__ 1

//TODO compile time test for byte order
// be64 etc using HW bswap.
//
// Endian byte swap
#if defined(__x86_64__)

@@ -9,8 +12,6 @@

#elif defined(__aarch64__)

//#pragma message "aarch64 fast bswap"

static inline uint64_t bswap_64( uint64_t a )
{
   uint64_t b;
@@ -81,7 +81,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8

#define v128_cmpeq0 vceqzq_u64
#define v128_iszero vceqzq_u64

// Not yet needed
//#define v128_cmpeq1
@@ -174,12 +174,31 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )

// AES
// consistent with Intel AES, break up for optimizing
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k ) vaeseq_u8( v, k )
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
   v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )

#define v128_aesenc_nokey( v ) \
   vaesmcq_u8( vaeseq_u8( v, v128_zero ) )

#define v128_aesenclast( v, k ) \
   v128_xor( k, vaeseq_u8( v, v128_zero ) )

#define v128_aesenclast_nokey( v, k ) \
   vaeseq_u8( v, v128_zero )

#define v128_aesdec( v, k ) \
   v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )

#define v128_aesdec_nokey( v, k ) \
   vaesimcq_u8( vaesdq_u8( v, v128_zero ) )

#define v128_aesdeclast( v, k ) \
   v128_xor( k, vaesdq_u8( v, v128_zero ) )

#define v128_aesdeclast_nokey( v, k ) \
   vaesdq_u8( v, v128_zero )

#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )
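
Why the NEON forms run the round with a zero key and XOR the key afterwards (a sketch, assuming <arm_neon.h> and the crypto extension): Intel's aesenc applies the round key after MixColumns, while AESE XORs the key into the state before ShiftRows/SubBytes, so matching the Intel convention means deferring the key.

#include <arm_neon.h>

// Illustration only: reproduces the Intel ordering
//   aesenc(x, k) = MixColumns( SubBytes( ShiftRows( x ) ) ) ^ k
// using AESE/AESMC with a zero round key, as the v128_aesenc macro above does.
static inline uint8x16_t aesenc_intel_order( uint8x16_t x, uint8x16_t k )
{
   return veorq_u8( k, vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) ) );
}
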
typedef union
{
@@ -189,7 +208,7 @@ typedef union
} __attribute__ ((aligned (16))) v128_ovly;


// Broadcast lane 0 to all lanes
// Broadcast lane 0 to all lanes, consistent with x86_64 broadcast
#define v128_bcast64(v) vdupq_laneq_u64( v, 0 )
#define v128_bcast32(v) vdupq_laneq_u32( v, 0 )
#define v128_bcast16(v) vdupq_laneq_u16( v, 0 )