mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v25.4
@@ -589,20 +589,7 @@ static inline void extr_lane_4x32( void *d, const void *s,
    ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ];
 }
 
-#if defined(__SSSE3__)
-
-static inline void v128_bswap32_80( void *d, void *s )
-{
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
-   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
-   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
-   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
-   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
-   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
-}
-
-#elif defined(__aarch64__) && defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON)
 
 static inline void v128_bswap32_80( void *d, void *s )
 {
@@ -641,6 +628,8 @@ static inline void v128_bswap32_80( void *d, void *s )
 
 #endif
 
+#if defined(__SSE2__) || defined(__ARM_NEON)
+
 static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
 {
    v128u32_t s0 = casti_v128u32( src,0 );
@@ -649,27 +638,12 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
    v128u32_t s3 = casti_v128u32( src,3 );
    v128u32_t s4 = casti_v128u32( src,4 );
 
-#if defined(__SSSE3__)
-
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
-
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
-
-#else
-
    s0 = v128_bswap32( s0 );
    s1 = v128_bswap32( s1 );
    s2 = v128_bswap32( s2 );
    s3 = v128_bswap32( s3 );
    s4 = v128_bswap32( s4 );
 
-#endif
-
    casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
    casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
    casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
@@ -696,6 +670,8 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
    casti_v128u32( d,19 ) = v128_duplane32( s4, 3 );
 }
 
+#endif   // SSE2 || NEON
+
 // 8x32
 
 #if defined(__AVX2__)
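For orientation, a scalar model of what v128_bswap32_intrlv80_4x32 above computes. This sketch is the editor's illustration, not part of the commit, and assumes a GCC/Clang-style __builtin_bswap32:

#include <stdint.h>

// Byte-swap the 20 words of an 80-byte block header, then replicate each
// word across 4 lanes so that lane L reads word i at d[ i*4 + L ].
static void bswap32_intrlv80_4x32_ref( uint32_t d[80], const uint32_t s[20] )
{
   for ( int i = 0; i < 20; i++ )
   {
      const uint32_t w = __builtin_bswap32( s[i] );   // v128_bswap32
      for ( int lane = 0; lane < 4; lane++ )
         d[ i*4 + lane ] = w;                         // v128_duplane32
   }
}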
@@ -1112,8 +1088,6 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 {
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
    const __m256i c1 = v256_32( 1 );
    const __m256i c2 = _mm256_add_epi32( c1, c1 );
    const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1124,11 +1098,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
    v128_t s3 = casti_v128( src,3 );
    v128_t s4 = casti_v128( src,4 );
 
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
+   s0 = v128_bswap32( s0 );
+   s1 = v128_bswap32( s1 );
+   s2 = v128_bswap32( s2 );
+   s3 = v128_bswap32( s3 );
+   s4 = v128_bswap32( s4 );
 
    casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 );
    casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32(
@@ -1617,8 +1591,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 
 static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 {
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
    const __m512i c1 = v512_32( 1 );
    const __m512i c2 = _mm512_add_epi32( c1, c1 );
    const __m512i c3 = _mm512_add_epi32( c2, c1 );
@@ -1628,11 +1600,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
    v128_t s3 = casti_v128( src,3 );
    v128_t s4 = casti_v128( src,4 );
 
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
+   s0 = v128_bswap32( s0 );
+   s1 = v128_bswap32( s1 );
+   s2 = v128_bswap32( s2 );
+   s3 = v128_bswap32( s3 );
+   s4 = v128_bswap32( s4 );
 
    casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 );
    casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1,
@@ -1878,6 +1850,8 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
 
 #endif
 
+#if defined(__SSE2__) || defined(__ARM_NEON)
+
 static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
 {
    v128u64_t s0 = casti_v128u64( src,0 );
@@ -1886,27 +1860,12 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
    v128u64_t s3 = casti_v128u64( src,3 );
    v128u64_t s4 = casti_v128u64( src,4 );
 
-#if defined(__SSSE3__)
-
-   const v128u64_t bswap_shuf = v128_set64( 0x0c0d0e0f08090a0b,
-                                            0x0405060700010203 );
-
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
-
-#else
-
   s0 = v128_bswap32( s0 );
   s1 = v128_bswap32( s1 );
   s2 = v128_bswap32( s2 );
   s3 = v128_bswap32( s3 );
   s4 = v128_bswap32( s4 );
 
-#endif
-
   casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
   casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
 
@@ -1923,6 +1882,8 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
    casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
 }
 
+#endif   // SSE2 || NEON
+
 static inline void extr_lane_2x64( void *dst, const void *src,
                                    const int lane, const int bit_len )
 {
@@ -2233,25 +2194,23 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
-   const __m256i bswap_shuf = mm256_bcast_m128(
-                _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
-   __m256i s0 = casti_m256i( src,0 );
-   __m256i s1 = casti_m256i( src,1 );
+   __m256i s0 = casti_m256i( src,0 );   // s0, s1
+   __m256i s2 = casti_m256i( src,1 );   // s2, s3
    v128_t  s4 = casti_v128( src,4 );
 
-   s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) );
+   s0 = mm256_bswap_32( s0 );
+   s2 = mm256_bswap_32( s2 );
+   s4 = v128_bswap32( s4 );
 
    casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
    casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
    casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
    casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
 
-   casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
-   casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
-   casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
-   casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
+   casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s2, 0x00 );
+   casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s2, 0x55 );
+   casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s2, 0xaa );
+   casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s2, 0xff );
 
    casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
                            _mm256_castsi128_si256( s4 ), 0x00 );
@@ -2648,8 +2607,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
 
 static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
 {
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
    const __m512i c1 = v512_64( 1 );
    v128_t s0 = casti_v128( src,0 );
    v128_t s1 = casti_v128( src,1 );
@@ -2657,11 +2614,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
    v128_t s3 = casti_v128( src,3 );
    v128_t s4 = casti_v128( src,4 );
 
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
+   s0 = v128_bswap32( s0 );
+   s1 = v128_bswap32( s1 );
+   s2 = v128_bswap32( s2 );
+   s3 = v128_bswap32( s3 );
+   s4 = v128_bswap32( s4 );
 
    casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 );
    casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1,
@@ -2842,49 +2799,45 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
 
 static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
 {
-   const __m512i bswap_shuf = mm512_bcast_m128(
-            _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
    const v128_t s0 = casti_v128( src,0 );
    const v128_t s1 = casti_v128( src,1 );
    const v128_t s2 = casti_v128( src,2 );
    const v128_t s3 = casti_v128( src,3 );
    const v128_t s4 = casti_v128( src,4 );
 
-   casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
-                                                 bswap_shuf );
-   casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ),
-                                                 bswap_shuf );
-   casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ),
-                                                 bswap_shuf );
-   casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ),
-                                                 bswap_shuf );
-   casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ),
-                                                 bswap_shuf );
+   casti_m512i( d,0 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
+                                        _mm512_castsi128_si512( s0 ) );
+   casti_m512i( d,1 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
+                                        _mm512_castsi128_si512( s1 ) );
+   casti_m512i( d,2 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
+                                        _mm512_castsi128_si512( s2 ) );
+   casti_m512i( d,3 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
+                                        _mm512_castsi128_si512( s3 ) );
+   casti_m512i( d,4 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
+                                        _mm512_castsi128_si512( s4 ) );
 }
 
 #else
 
 static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
 {
-   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
    v128_t s0 = casti_v128( src,0 );
    v128_t s1 = casti_v128( src,1 );
    v128_t s2 = casti_v128( src,2 );
    v128_t s3 = casti_v128( src,3 );
    v128_t s4 = casti_v128( src,4 );
 
-   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
-   s1 = _mm_shuffle_epi8( s1, bswap_shuf );
-   s2 = _mm_shuffle_epi8( s2, bswap_shuf );
-   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
-   s4 = _mm_shuffle_epi8( s4, bswap_shuf );
+   s0 = v128_bswap32( s0 );
+   s1 = v128_bswap32( s1 );
+   s2 = v128_bswap32( s2 );
+   s3 = v128_bswap32( s3 );
+   s4 = v128_bswap32( s4 );
 
-   casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
-   casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
-   casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
-   casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
-   casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
+   casti_m512i( d,0 ) = mm512_bcast128( s0 );
+   casti_m512i( d,1 ) = mm512_bcast128( s1 );
+   casti_m512i( d,2 ) = mm512_bcast128( s2 );
+   casti_m512i( d,3 ) = mm512_bcast128( s3 );
+   casti_m512i( d,4 ) = mm512_bcast128( s4 );
 }
 
 #endif   // AVX512VBMI ELSE
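A note on the VBMI path above: _mm512_permutexvar_epi8 takes the byte-index vector as its first operand, so besides switching to the shared V512_BSWAP32 constant the rewrite also puts the index first. A minimal self-contained sketch of the trick (illustrative name, AVX512VBMI assumed): one vpermb both byte-swaps the 16-byte input and broadcasts it to all four 128-bit lanes, because every lane of the index selects source bytes 0..15 in bswap32 order.

#include <immintrin.h>

static inline __m512i bswap32_bcast128_vbmi( __m128i s )
{
   const __m512i idx = _mm512_set_epi64(      // same data as V512_BSWAP32
                  0x0c0d0e0f08090a0b, 0x0405060700010203,
                  0x0c0d0e0f08090a0b, 0x0405060700010203,
                  0x0c0d0e0f08090a0b, 0x0405060700010203,
                  0x0c0d0e0f08090a0b, 0x0405060700010203 );
   return _mm512_permutexvar_epi8( idx, _mm512_castsi128_si512( s ) );
}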
@@ -521,29 +521,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #elif defined(__SSSE3__)
 // SSSE3: fastest 32 bit, very fast 16, fast 8
 
-#define v128_shuflr64_8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
-
-#define v128_shufll64_8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
-
-#define v128_shuflr64_24( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
-
-#define v128_shufll64_24( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
-
-#define v128_shuflr32_8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
-
-#define v128_shufll32_8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( \
-                          0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
+#define v128_shuflr64_8( v )   _mm_shuffle_epi8( v, V128_SHUFLR64_8 )
+#define v128_shufll64_8( v )   _mm_shuffle_epi8( v, V128_SHUFLL64_8 )
+#define v128_shuflr64_24(v )   _mm_shuffle_epi8( v, V128_SHUFLR64_24 )
+#define v128_shufll64_24(v )   _mm_shuffle_epi8( v, V128_SHUFLL64_24 )
+#define v128_shuflr32_8( v )   _mm_shuffle_epi8( v, V128_SHUFLR32_8 )
+#define v128_shufll32_8( v )   _mm_shuffle_epi8( v, V128_SHUFLL32_8 )
 
 #define v128_ror64( v, c ) \
    ( (c) == 8 ) ? v128_shuflr64_8( v ) \
@@ -612,74 +595,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 // (v1 ^ v0) >>> n, ARM NEON has optimized version
 #define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
 
-/* not used
-// x2 rotates elements in 2 individual vectors in a double buffered
-// optimization for SSE2, does nothing for AVX512 but is there for
-// transparency.
-
-#if defined(VL256)
-
-#define v128_2ror64( v1, v0, c ) \
-   _mm_ror_epi64( v0, c ); \
-   _mm_ror_epi64( v1, c )
-
-#define v128_2rol64( v1, v0, c ) \
-   _mm_rol_epi64( v0, c ); \
-   _mm_rol_epi64( v1, c )
-
-#define v128_2ror32( v1, v0, c ) \
-   _mm_ror_epi32( v0, c ); \
-   _mm_ror_epi32( v1, c )
-
-#define v128_2rol32( v1, v0, c ) \
-   _mm_rol_epi32( v0, c ); \
-   _mm_rol_epi32( v1, c )
-
-#else  // SSE2
-
-#define v128_2ror64( v1, v0, c ) \
-{ \
-   __m128i t0 = _mm_srli_epi64( v0, c ); \
-   __m128i t1 = _mm_srli_epi64( v1, c ); \
-   v0 = _mm_slli_epi64( v0, 64-(c) ); \
-   v1 = _mm_slli_epi64( v1, 64-(c) ); \
-   v0 = _mm_or_si256( v0, t0 ); \
-   v1 = _mm_or_si256( v1, t1 ); \
-}
-
-#define v128_2rol64( v1, v0, c ) \
-{ \
-   __m128i t0 = _mm_slli_epi64( v0, c ); \
-   __m128i t1 = _mm_slli_epi64( v1, c ); \
-   v0 = _mm_srli_epi64( v0, 64-(c) ); \
-   v1 = _mm_srli_epi64( v1, 64-(c) ); \
-   v0 = _mm_or_si256( v0, t0 ); \
-   v1 = _mm_or_si256( v1, t1 ); \
-}
-
-#define v128_2ror32( v1, v0, c ) \
-{ \
-   __m128i t0 = _mm_srli_epi32( v0, c ); \
-   __m128i t1 = _mm_srli_epi32( v1, c ); \
-   v0 = _mm_slli_epi32( v0, 32-(c) ); \
-   v1 = _mm_slli_epi32( v1, 32-(c) ); \
-   v0 = _mm_or_si256( v0, t0 ); \
-   v1 = _mm_or_si256( v1, t1 ); \
-}
-
-#define v128_2rol32( v1, v0, c ) \
-{ \
-   __m128i t0 = _mm_slli_epi32( v0, c ); \
-   __m128i t1 = _mm_slli_epi32( v1, c ); \
-   v0 = _mm_srli_epi32( v0, 32-(c) ); \
-   v1 = _mm_srli_epi32( v1, 32-(c) ); \
-   v0 = _mm_or_si256( v0, t0 ); \
-   v1 = _mm_or_si256( v1, t1 ); \
-}
-
-#endif // AVX512 else SSE2
-*/
 
 // Cross lane shuffles
 
 // No NEON version
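As a sanity model for the one-line shuffle macros above: rotating each 64-bit lane right by 8 bits is exactly a byte rotation, which pshufb performs with a fixed control. The sketch below is the editor's illustration (the inline constant carries the same data as V128_SHUFLR64_8) and is equivalent to v128_ror64( v, 8 ):

#include <tmmintrin.h>   // SSSE3

static inline __m128i ror64x2_by8( __m128i v )
{
   // dst byte j of each qword = src byte (j+1) mod 8, i.e. a 64-bit ror 8
   const __m128i ctl = _mm_set_epi64x( 0x080f0e0d0c0b0a09,
                                       0x0007060504030201 );
   return _mm_shuffle_epi8( v, ctl );
}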
@@ -721,13 +636,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
    _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
                                         0x08090a0b0c0d0e0f ) )
 
-#define v128_bswap64( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
-                                        0x0001020304050607 ) )
+#define v128_bswap64( v )   _mm_shuffle_epi8( v, V128_BSWAP64 )
+
+#define v128_bswap32( v )   _mm_shuffle_epi8( v, V128_BSWAP32 )
 
-#define v128_bswap32( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
-                                        0x0405060700010203 ) )
 #define v128_bswap16( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
                                         0x0607040502030001 )
@@ -735,85 +647,30 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 // 8 byte qword * 8 qwords * 2 lanes = 128 bytes
 #define v128_block_bswap64( d, s ) \
 { \
-   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
-   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
-   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
-   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
-   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
-   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
-   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
-   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
-   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
-}
-#define v128_block_bswap64_512 v128_block_bswap64
-
-#define v128_block_bswap64_1024( d, s ) \
-{ \
-   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
-   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
-   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
-   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
-   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
-   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
-   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
-   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
-   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
-   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
-   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
-   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
-   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
-   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
-   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
-   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
-   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
+   casti_v128( d,0 ) = v128_bswap64( casti_v128( s,0 ) ); \
+   casti_v128( d,1 ) = v128_bswap64( casti_v128( s,1 ) ); \
+   casti_v128( d,2 ) = v128_bswap64( casti_v128( s,2 ) ); \
+   casti_v128( d,3 ) = v128_bswap64( casti_v128( s,3 ) ); \
+   casti_v128( d,4 ) = v128_bswap64( casti_v128( s,4 ) ); \
+   casti_v128( d,5 ) = v128_bswap64( casti_v128( s,5 ) ); \
+   casti_v128( d,6 ) = v128_bswap64( casti_v128( s,6 ) ); \
+   casti_v128( d,7 ) = v128_bswap64( casti_v128( s,7 ) ); \
 }
 
 // 4 byte dword * 8 dwords * 4 lanes = 128 bytes
 #define v128_block_bswap32( d, s ) \
 { \
-   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
-   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
-   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
-   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
-   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
-   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
-   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
-   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
-   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
+   casti_v128( d,0 ) = v128_bswap32( casti_v128( s,0 ) ); \
+   casti_v128( d,1 ) = v128_bswap32( casti_v128( s,1 ) ); \
+   casti_v128( d,2 ) = v128_bswap32( casti_v128( s,2 ) ); \
+   casti_v128( d,3 ) = v128_bswap32( casti_v128( s,3 ) ); \
+   casti_v128( d,4 ) = v128_bswap32( casti_v128( s,4 ) ); \
+   casti_v128( d,5 ) = v128_bswap32( casti_v128( s,5 ) ); \
+   casti_v128( d,6 ) = v128_bswap32( casti_v128( s,6 ) ); \
+   casti_v128( d,7 ) = v128_bswap32( casti_v128( s,7 ) ); \
 }
 #define v128_block_bswap32_256 v128_block_bswap32
 
-
-#define v128_block_bswap32_128( d, s ) \
-{ \
-   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
-   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
-   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
-   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
-   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
-}
-
-#define v128_block_bswap32_512( d, s ) \
-{ \
-   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
-   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
-   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
-   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
-   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
-   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
-   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
-   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
-   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
-   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
-   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
-   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
-   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
-   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
-   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
-   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
-   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
-}
-
 #else  // SSE2
 
 static inline v128_t v128_bswap64( __m128i v )
@@ -835,7 +692,7 @@ static inline v128_t v128_bswap16( __m128i v )
    return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
 }
 
-#define v128_bswap128( v )   v128_qrev32( v128_bswap64( v ) )
+#define v128_bswap128( v )   v128_rev64( v128_bswap64( v ) )
 
 static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
 {
@@ -849,26 +706,6 @@ static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
    d[7] = v128_bswap64( s[7] );
 }
 
-static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
-{
-   d[ 0] = v128_bswap64( s[ 0] );
-   d[ 1] = v128_bswap64( s[ 1] );
-   d[ 2] = v128_bswap64( s[ 2] );
-   d[ 3] = v128_bswap64( s[ 3] );
-   d[ 4] = v128_bswap64( s[ 4] );
-   d[ 5] = v128_bswap64( s[ 5] );
-   d[ 6] = v128_bswap64( s[ 6] );
-   d[ 7] = v128_bswap64( s[ 7] );
-   d[ 8] = v128_bswap64( s[ 8] );
-   d[ 9] = v128_bswap64( s[ 9] );
-   d[10] = v128_bswap64( s[10] );
-   d[11] = v128_bswap64( s[11] );
-   d[12] = v128_bswap64( s[12] );
-   d[13] = v128_bswap64( s[13] );
-   d[14] = v128_bswap64( s[14] );
-   d[15] = v128_bswap64( s[15] );
-}
-
 static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
 {
    d[0] = v128_bswap32( s[0] );
@@ -882,26 +719,6 @@ static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
 }
 #define v128_block_bswap32_256 v128_block_bswap32
 
-static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
-{
-   d[ 0] = v128_bswap32( s[ 0] );
-   d[ 1] = v128_bswap32( s[ 1] );
-   d[ 2] = v128_bswap32( s[ 2] );
-   d[ 3] = v128_bswap32( s[ 3] );
-   d[ 4] = v128_bswap32( s[ 4] );
-   d[ 5] = v128_bswap32( s[ 5] );
-   d[ 6] = v128_bswap32( s[ 6] );
-   d[ 7] = v128_bswap32( s[ 7] );
-   d[ 8] = v128_bswap32( s[ 8] );
-   d[ 9] = v128_bswap32( s[ 9] );
-   d[10] = v128_bswap32( s[10] );
-   d[11] = v128_bswap32( s[11] );
-   d[12] = v128_bswap32( s[12] );
-   d[13] = v128_bswap32( s[13] );
-   d[14] = v128_bswap32( s[14] );
-   d[15] = v128_bswap32( s[15] );
-}
-
 #endif // SSSE3 else SSE2
 
 // alignr instruction for 32 & 64 bit elements is only available with AVX512
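The block helpers above now route through v128_bswap64 / v128_bswap32, so one definition serves both the SSSE3 and plain-SSE2 builds. For reference, the scalar equivalent of v128_block_bswap64 over its 128-byte block (editor's sketch; GCC/Clang builtin assumed):

#include <stdint.h>

static void block_bswap64_ref( uint64_t d[16], const uint64_t s[16] )
{
   for ( int i = 0; i < 16; i++ )
      d[i] = __builtin_bswap64( s[i] );   // v128_bswap64 does two per vector
}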
@@ -61,8 +61,10 @@ typedef union
 #if defined(__AVX2__)
 
 // Broadcast, ie set1, from 128 bit vector input.
-#define mm256_bcast_m128( v ) \
+#define mm256_bcast128( v ) \
    _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
+// deprecated
+#define mm256_bcast_m128  mm256_bcast128
 
 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
@@ -73,23 +75,23 @@ typedef union
 
 #else
 
-#define mm256_bcast128lo_64( i64 )   mm256_bcast_m128( v128_mov64( i64 ) )
+#define mm256_bcast128lo_64( i64 )   mm256_bcast128( v128_mov64( i64 ) )
 
 #define mm256_bcast128hi_64( i64 )   _mm256_permute4x64_epi64( \
                   _mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
 
 #endif
 
-#define mm256_set2_64( i1, i0 )   mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
+#define mm256_set2_64( i1, i0 )   mm256_bcast128( _mm_set_epi64x( i1, i0 ) )
 
 #define mm256_set4_32( i3, i2, i1, i0 ) \
-   mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
+   mm256_bcast128( _mm_set_epi32( i3, i2, i1, i0 ) )
 
 // All SIMD constant macros are actually functions containing executable
 // code and therefore can't be used as compile time initializers.
 
 #define m256_zero       _mm256_setzero_si256()
-#define m256_one_128    mm256_bcast_m128( v128_one )
+#define m256_one_128    mm256_bcast128( v128_one )
 
 static inline __m256i mm256_neg1_fn()
 {
@@ -231,21 +233,8 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_swap64_32   mm256_qrev32   // grandfathered
 
 #define mm256_qrev16(v)   mm256_shuffle16( v, 0x1b )
 
-#define mm256_qrev8(v) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-                  v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
-
 #define mm256_lrev16(v)   mm256_shuffle16( v, 0xb1 )
 
-#define mm256_lrev8(v) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-                  v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
-
-#define mm256_wrev8(v) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-                  v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
-
 //
 // Bit rotations.
 
@@ -268,50 +257,33 @@ static inline __m256i mm256_not( const __m256i v )
 
 #if defined(VL256)
 
-#define mm256_ror_64 _mm256_ror_epi64
-#define mm256_rol_64 _mm256_rol_epi64
-#define mm256_ror_32 _mm256_ror_epi32
-#define mm256_rol_32 _mm256_rol_epi32
+#define mm256_ror_64    _mm256_ror_epi64
+#define mm256_rol_64    _mm256_rol_epi64
+#define mm256_ror_32    _mm256_ror_epi32
+#define mm256_rol_32    _mm256_rol_epi32
 
 // Redundant but naming may be a better fit in some applications.
-#define mm126_shuflr64_8( v)  _mm256_ror_epi64( v,  8 )
-#define mm156_shufll64_8( v)  _mm256_rol_epi64( v,  8 )
-#define mm256_shuflr64_16(v)  _mm256_ror_epi64( v, 16 )
-#define mm256_shufll64_16(v)  _mm256_rol_epi64( v, 16 )
-#define mm256_shuflr64_24(v)  _mm256_ror_epi64( v, 24 )
-#define mm256_shufll64_24(v)  _mm256_rol_epi64( v, 24 )
-#define mm256_shuflr32_8( v)  _mm256_ror_epi32( v,  8 )
-#define mm256_shufll32_8( v)  _mm256_rol_epi32( v,  8 )
-#define mm256_shuflr32_16(v)  _mm256_ror_epi32( v, 16 )
-#define mm256_shufll32_16(v)  _mm256_rol_epi32( v, 16 )
+#define mm256_shuflr64_8( v)  _mm256_ror_epi64( v,  8 )
+#define mm256_shufll64_8( v)  _mm256_rol_epi64( v,  8 )
+#define mm256_shuflr64_16(v)  _mm256_ror_epi64( v, 16 )
+#define mm256_shufll64_16(v)  _mm256_rol_epi64( v, 16 )
+#define mm256_shuflr64_24(v)  _mm256_ror_epi64( v, 24 )
+#define mm256_shufll64_24(v)  _mm256_rol_epi64( v, 24 )
+#define mm256_shuflr32_8( v)  _mm256_ror_epi32( v,  8 )
+#define mm256_shufll32_8( v)  _mm256_rol_epi32( v,  8 )
+#define mm256_shuflr32_16(v)  _mm256_ror_epi32( v, 16 )
+#define mm256_shufll32_16(v)  _mm256_rol_epi32( v, 16 )
 
 #else
 
 // ROR & ROL will always find the fastest but these names may be a better fit
 // in some applications.
-#define mm256_shuflr64_8( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
-
-#define mm256_shufll64_8( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
-
-#define mm256_shuflr64_24( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
-
-#define mm256_shufll64_24( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
-
-#define mm256_shuflr32_8( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
-
-#define mm256_shufll32_8( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
-           _mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
+#define mm256_shuflr64_8( v )   _mm256_shuffle_epi8( v, V256_SHUFLR64_8 )
+#define mm256_shufll64_8( v )   _mm256_shuffle_epi8( v, V256_SHUFLL64_8 )
+#define mm256_shuflr64_24(v )   _mm256_shuffle_epi8( v, V256_SHUFLR64_24 )
+#define mm256_shufll64_24(v )   _mm256_shuffle_epi8( v, V256_SHUFLL64_24 )
+#define mm256_shuflr32_8( v )   _mm256_shuffle_epi8( v, V256_SHUFLR32_8 )
+#define mm256_shufll32_8( v )   _mm256_shuffle_epi8( v, V256_SHUFLL32_8 )
 
 #define mm256_ror_64( v, c ) \
    ( (c) == 8 ) ? mm256_shuflr64_8( v ) \
@@ -347,96 +319,6 @@ static inline __m256i mm256_not( const __m256i v )
 
 #endif
 
-//
-// x2 rotates elements in 2 individual vectors in a double buffered
-// optimization for AVX2, does nothing for AVX512 but is here for
-// transparency.
-
-#if defined(VL256)
-/*
-#define mm256_ror_64 _mm256_ror_epi64
-#define mm256_rol_64 _mm256_rol_epi64
-#define mm256_ror_32 _mm256_ror_epi32
-#define mm256_rol_32 _mm256_rol_epi32
-*/
-#define mm256_rorx2_64( v1, v0, c ) \
-   _mm256_ror_epi64( v0, c ); \
-   _mm256_ror_epi64( v1, c )
-
-#define mm256_rolx2_64( v1, v0, c ) \
-   _mm256_rol_epi64( v0, c ); \
-   _mm256_rol_epi64( v1, c )
-
-#define mm256_rorx2_32( v1, v0, c ) \
-   _mm256_ror_epi32( v0, c ); \
-   _mm256_ror_epi32( v1, c )
-
-#define mm256_rolx2_32( v1, v0, c ) \
-   _mm256_rol_epi32( v0, c ); \
-   _mm256_rol_epi32( v1, c )
-
-#else   // AVX2
-/*
-// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
-
-#define mm256_ror_64( v, c ) \
-   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
-                    _mm256_slli_epi64( v, 64-(c) ) )
-
-#define mm256_rol_64( v, c ) \
-   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
-                    _mm256_srli_epi64( v, 64-(c) ) )
-
-#define mm256_ror_32( v, c ) \
-   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
-                    _mm256_slli_epi32( v, 32-(c) ) )
-
-#define mm256_rol_32( v, c ) \
-   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
-                    _mm256_srli_epi32( v, 32-(c) ) )
-*/
-#define mm256_rorx2_64( v1, v0, c ) \
-{ \
-   __m256i t0 = _mm256_srli_epi64( v0, c ); \
-   __m256i t1 = _mm256_srli_epi64( v1, c ); \
-   v0 = _mm256_slli_epi64( v0, 64-(c) ); \
-   v1 = _mm256_slli_epi64( v1, 64-(c) ); \
-   v0 = _mm256_or_si256( v0, t0 ); \
-   v1 = _mm256_or_si256( v1, t1 ); \
-}
-
-#define mm256_rolx2_64( v1, v0, c ) \
-{ \
-   __m256i t0 = _mm256_slli_epi64( v0, c ); \
-   __m256i t1 = _mm256_slli_epi64( v1, c ); \
-   v0 = _mm256_srli_epi64( v0, 64-(c) ); \
-   v1 = _mm256_srli_epi64( v1, 64-(c) ); \
-   v0 = _mm256_or_si256( v0, t0 ); \
-   v1 = _mm256_or_si256( v1, t1 ); \
-}
-
-#define mm256_rorx2_32( v1, v0, c ) \
-{ \
-   __m256i t0 = _mm256_srli_epi32( v0, c ); \
-   __m256i t1 = _mm256_srli_epi32( v1, c ); \
-   v0 = _mm256_slli_epi32( v0, 32-(c) ); \
-   v1 = _mm256_slli_epi32( v1, 32-(c) ); \
-   v0 = _mm256_or_si256( v0, t0 ); \
-   v1 = _mm256_or_si256( v1, t1 ); \
-}
-
-#define mm256_rolx2_32( v1, v0, c ) \
-{ \
-   __m256i t0 = _mm256_slli_epi32( v0, c ); \
-   __m256i t1 = _mm256_slli_epi32( v1, c ); \
-   v0 = _mm256_srli_epi32( v0, 32-(c) ); \
-   v1 = _mm256_srli_epi32( v1, 32-(c) ); \
-   v0 = _mm256_or_si256( v0, t0 ); \
-   v1 = _mm256_or_si256( v1, t1 ); \
-}
-
-#endif // AVX512 else AVX2
 
 #if defined(__AVX2__)
 
 // 128 bit version of unpack
@@ -453,20 +335,14 @@ static inline __m256i mm256_not( const __m256i v )
 //
 // Cross lane shuffles
 //
-// Rotate elements accross all lanes.
-#define mm256_shuffle_16( v, c ) \
-   _mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \
-                    _mm256_shufflelo_epi16( v, c ) )
 
 // Swap 128 bit elements in 256 bit vector.
-#define mm256_swap_128( v )   _mm256_permute4x64_epi64( v, 0x4e )
+#define mm256_rev_128( v )    _mm256_permute4x64_epi64( v, 0x4e )
+#define mm256_swap_128        mm256_rev_128   // grandfathered
 
-// Rotate 256 bit vector by one 64 bit element
-#define mm256_shuflr_64( v )  _mm256_permute4x64_epi64( v, 0x39 )
-#define mm256_shufll_64( v )  _mm256_permute4x64_epi64( v, 0x93 )
-
-// Reverse 64 bit elements
+/* not used
+// Reverse elements
 #define mm256_rev_64( v )     _mm256_permute4x64_epi64( v, 0x1b )
 
 #define mm256_rev_32( v ) \
@@ -474,7 +350,12 @@ static inline __m256i mm256_not( const __m256i v )
                           0x0000000400000005, 0x0000000600000007 )
 
 #define mm256_rev_16( v ) \
-   _mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e )
+   _mm256_permute4x64_epi64( mm256_shuffle16( v, 0x1b ), 0x4e )
+*/
+
+// Rotate 256 bit vector by one 64 bit element
+#define mm256_shuflr_64( v )  _mm256_permute4x64_epi64( v, 0x39 )
+#define mm256_shufll_64( v )  _mm256_permute4x64_epi64( v, 0x93 )
 
 /* Not used
 // Rotate 256 bit vector by one 32 bit element.
@@ -486,7 +367,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #else
 #define mm256_shuflr_32( v ) \
     _mm256_permutevar8x32_epi32( v, \
-                  _mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \
+                  _mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
                                      0x0000000400000003, 0x0000000200000001 ) )
 #define mm256_shufll_32( v ) \
     _mm256_permutevar8x32_epi32( v, \
@@ -507,113 +388,64 @@ static inline __m256i mm256_shufll_32( const __m256i v )
    _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                            _mm256_castsi256_ps( v2 ), c ) );
 
-#define mm256_swap128_64(v)   _mm256_shuffle_epi32( v, 0x4e )
+#define mm256_rev128_64(v)    _mm256_shuffle_epi32( v, 0x4e )
+#define mm256_swap128_64      mm256_rev128_64   // grandfathered
 
 /*not used
 #define mm256_rev128_32(v)    _mm256_shuffle_epi32( v, 0x1b )
-#define mm256_rev128_16(v)    mm256_shuffle_16( v, 0x1b )
+#define mm256_rev128_16(v)    mm256_shuffle16( v, 0x1b )
 */
 
 #define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 )
 
-#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 )
-#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 )
+/* not used
+#define mm256_shuflr128_16(v) mm256_shuffle16( v, 0x39 )
+#define mm256_shufll128_16(v) mm256_shuffle16( v, 0x93 )
 
-/* Not used
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 { return _mm256_alignr_epi8( v, v, c ); }
 */
 
 // Reverse byte order in elements, endian bswap.
-#define mm256_bswap_64( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
+#define mm256_bswap_64( v )   _mm256_shuffle_epi8( v, V256_BSWAP64 )
 
-#define mm256_bswap_32( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
-                             0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
+#define mm256_bswap_32( v )   _mm256_shuffle_epi8( v, V256_BSWAP32 )
 
 /* not used
 #define mm256_bswap_16( v ) \
-   _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
+   _mm256_shuffle_epi8( v, mm256_bcast128( _mm_set_epi64x( \
                              0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
-//
 */
 
 // Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) \
 { \
-   __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
-                                                   0x0001020304050607 ) ); \
-   casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
-   casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
-   casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
-   casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
-   casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
-   casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
-   casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
-   casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
-}
-#define mm256_block_bswap64_512 mm256_block_bswap_64
-
-#define mm256_block_bswap64_1024( d, s ) \
-{ \
-   __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
-                                                   0x0001020304050607 ) ); \
-   casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
-   casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
-   casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
-   casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
-   casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
-   casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
-   casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
-   casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
-   casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
-   casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
-   casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
-   casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
-   casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
-   casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
-   casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
-   casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
+   casti_m256i( d,0 ) = mm256_bswap_64( casti_m256i( s,0 ) ); \
+   casti_m256i( d,1 ) = mm256_bswap_64( casti_m256i( s,1 ) ); \
+   casti_m256i( d,2 ) = mm256_bswap_64( casti_m256i( s,2 ) ); \
+   casti_m256i( d,3 ) = mm256_bswap_64( casti_m256i( s,3 ) ); \
+   casti_m256i( d,4 ) = mm256_bswap_64( casti_m256i( s,4 ) ); \
+   casti_m256i( d,5 ) = mm256_bswap_64( casti_m256i( s,5 ) ); \
+   casti_m256i( d,6 ) = mm256_bswap_64( casti_m256i( s,6 ) ); \
+   casti_m256i( d,7 ) = mm256_bswap_64( casti_m256i( s,7 ) ); \
 }
 
 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
 #define mm256_block_bswap_32( d, s ) \
 { \
-   __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
-                                                   0x0405060700010203 ) ); \
-   casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
-   casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
-   casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
-   casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
-   casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
-   casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
-   casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
-   casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
+   casti_m256i( d, 0 ) = mm256_bswap_32( casti_m256i( s, 0 ) ); \
+   casti_m256i( d, 1 ) = mm256_bswap_32( casti_m256i( s, 1 ) ); \
+   casti_m256i( d, 2 ) = mm256_bswap_32( casti_m256i( s, 2 ) ); \
+   casti_m256i( d, 3 ) = mm256_bswap_32( casti_m256i( s, 3 ) ); \
+   casti_m256i( d, 4 ) = mm256_bswap_32( casti_m256i( s, 4 ) ); \
+   casti_m256i( d, 5 ) = mm256_bswap_32( casti_m256i( s, 5 ) ); \
+   casti_m256i( d, 6 ) = mm256_bswap_32( casti_m256i( s, 6 ) ); \
+   casti_m256i( d, 7 ) = mm256_bswap_32( casti_m256i( s, 7 ) ); \
 }
 #define mm256_block_bswap32_256 mm256_block_bswap_32
 
-#define mm256_block_bswap32_512( d, s ) \
-{ \
-   __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
-                                                   0x0405060700010203 ) ); \
-   casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
-   casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
-   casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
-   casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
-   casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
-   casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
-   casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
-   casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
-   casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
-   casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
-   casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
-   casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
-   casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
-   casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
-   casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
-   casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
-}
-
 #if defined(VL256)
 
 #define mm256_alignr64   _mm256_alignr_epi64
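On mm256_bcast128 above: immediate 0x44 is binary 01 00 01 00, so qwords {0,1} are written to both halves, duplicating the 128-bit input across the 256-bit vector. A sketch of the same operation using the dedicated AVX2 broadcast, which should be interchangeable here (editor's illustration):

#include <immintrin.h>

static inline __m256i bcast128_sketch( __m128i v )
{
   return _mm256_broadcastsi128_si256( v );   // low 128 bits -> both lanes
}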
@@ -108,11 +108,13 @@ typedef union
 
 // A simple 128 bit permute, using function instead of macro avoids
 // problems if the v arg is passed as an expression.
-static inline __m512i mm512_perm_128( const __m512i v, const int c )
+static inline __m512i mm512_perm128( const __m512i v, const int c )
 { return _mm512_shuffle_i64x2( v, v, c ); }
 
 // Broadcast 128 bit vector to all lanes of 512 bit vector.
-#define mm512_bcast_m128( v )  mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
+#define mm512_bcast128( v )    mm512_perm128( _mm512_castsi128_si512( v ), 0 )
+// deprecated
+#define mm512_bcast_m128  mm512_bcast128
 
 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
@@ -120,7 +122,7 @@ static inline __m512i mm512_perm128( const __m512i v, const int c )
 #define mm512_bcast128hi_64( i64 )   _mm512_maskz_set1_epi64( 0xaa, i64 )
 
 #define mm512_set2_64( i1, i0 ) \
-   mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
+   mm512_bcast128( _mm_set_epi64x( i1, i0 ) )
 
 // Pseudo constants.
 #define m512_zero       _mm512_setzero_si512()
@@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Reverse byte order of packed elements, vectorized endian conversion.
 
-#define mm512_bswap_64( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
+#define mm512_bswap_64( v )   _mm512_shuffle_epi8( v, V512_BSWAP64 )
 
-#define mm512_bswap_32( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                             0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
+#define mm512_bswap_32( v )   _mm512_shuffle_epi8( v, V512_BSWAP32 )
 
+/* not used
+#define mm512_bswap_16( v ) \
+   _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
+                             0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
+*/
 
-#define mm512_bswap_16( v ) \
-   _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
-                             0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
 
 // Source and destination are pointers, may point to same memory.
 // 8 lanes of 64 bytes each
 #define mm512_block_bswap_64( d, s ) \
 { \
-   const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-   casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-   casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-   casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-   casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-   casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-   casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-   casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-   casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-}
-#define mm512_block_bswap64_512 mm512_block_bswap_64
-
-#define mm512_block_bswap64_1024( d, s ) \
-{ \
-   const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-   casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-   casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-   casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-   casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-   casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-   casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-   casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-   casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-   casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
-   casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
-   casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
-   casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
-   casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
-   casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
-   casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
-   casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
+   casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \
+   casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \
+   casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \
+   casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \
+   casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \
+   casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \
+   casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \
+   casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \
 }
 
 // 16 lanes of 32 bytes each
 #define mm512_block_bswap_32( d, s ) \
 { \
-   const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                             0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
-   casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-   casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-   casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-   casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-   casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-   casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-   casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-   casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
+   casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \
+   casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \
+   casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \
+   casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \
+   casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \
+   casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \
+   casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \
+   casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \
 }
 #define mm512_block_bswap32_256 mm512_block_bswap_32
 
-#define mm512_block_bswap32_512( d, s ) \
-{ \
-   const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
-                             0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
-   casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
-   casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
-   casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
-   casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
-   casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
-   casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
-   casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
-   casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
-   casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
-   casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
-   casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
-   casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
-   casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
-   casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
-   casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
-   casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
-}
-
 
 
 // Cross-lane shuffles implementing rotation of packed elements.
 //
 
 // shuffle 16 bit elements within 64 bit lanes.
 #define mm512_shuffle16( v, c ) \
    _mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c )
 
 // Rotate elements across entire vector.
-static inline __m512i mm512_swap_256( const __m512i v )
+static inline __m512i mm512_rev_256( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 4 ); }
-#define mm512_shuflr_256 mm512_swap_256
-#define mm512_shufll_256 mm512_swap_256
+#define mm512_swap_256   mm512_rev_256   // grandfathered
 
 static inline __m512i mm512_shuflr_128( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 2 ); }
@@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 // Rotate elements within 256 bit lanes of 512 bit vector.
 
 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
-#define mm512_shuflr256_128      mm512_swap256_128
-#define mm512_shufll256_128      mm512_swap256_128
+#define mm512_rev256_128( v )    _mm512_permutex_epi64( v, 0x4e )
+#define mm512_swap256_128        mm512_rev256_128   // grandfathered
 
 // Rotate 256 bit lanes by one 64 bit element
 #define mm512_shuflr256_64( v )   _mm512_permutex_epi64( v, 0x39 )
@@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
-#define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
-#define mm512_shuflr128_64      mm512_swap128_64
-#define mm512_shufll128_64      mm512_swap128_64
+#define mm512_rev128_64( v )    _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_swap128_64        mm512_rev128_64   // grandfathered
+
+/*not used
+#define mm512_rev128_32(v)   _mm526_shuffle_epi32( v, 0x1b )
+#define mm512_rev128_16(v)   mm512_shuffle16( v, 0x1b )
+*/
 
 // Rotate 128 bit lanes by one 32 bit element
 #define mm512_shuflr128_32( v )   _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_shufll128_32( v )   _mm512_shuffle_epi32( v, 0x93 )
 
 /* Not used
 
 #define mm512_shuflr128_16(v)   mm512_shuffle16( v, 0x39 )
 #define mm512_shufll128_16(v)   mm512_shuffle16( v, 0x93 )
 
 // Rotate 128 bit lanes right by c bytes, versatile and just as fast
 static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
 { return _mm512_alignr_epi8( v, v, c ); }
@@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
                                         _mm512_castsi512_ps( v2 ), c ) );
 
 // 64 bit lanes
-// Not really necessary with AVX512, included for consistency with AVX2/SSE.
+// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
 
-#define mm512_swap64_32( v )   _mm512_shuffle_epi32( v, 0xb1 )
-#define mm512_shuflr64_32      mm512_swap64_32
-#define mm512_shufll64_32      mm512_swap64_32
+#define mm512_qrev32( v )      _mm512_shuffle_epi32( v, 0xb1 )
+#define mm512_swap64_32        mm512_qrev32   // grandfathered
 
 #define mm512_shuflr64_24( v )   _mm512_ror_epi64( v, 24 )
 #define mm512_shufll64_24( v )   _mm512_rol_epi64( v, 24 )
@@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
 /* Not used
 // 32 bit lanes
 
-#define mm512_swap32_16( v )   _mm512_ror_epi32( v, 16 )
-#define mm512_shuflr32_16      mm512_swap32_16
-#define mm512_shufll32_16      mm512_swap32_16
+#define mm512_lrev16( v )      _mm512_ror_epi32( v, 16 )
 
 #define mm512_shuflr32_8( v )   _mm512_ror_epi32( v, 8 )
 #define mm512_shufll32_8( v )   _mm512_rol_epi32( v, 8 )
55 simd-utils/simd-constants.c Normal file
@@ -0,0 +1,55 @@
#include "simd-utils.h"

#if defined(SIMD512)

const __m512i V512_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
                               0x0001020304050607, 0x08090a0b0c0d0e0f,
                               0x0001020304050607, 0x08090a0b0c0d0e0f,
                               0x0001020304050607, 0x08090a0b0c0d0e0f };

const __m512i V512_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
                               0x0405060700010203, 0x0c0d0e0f08090a0b,
                               0x0405060700010203, 0x0c0d0e0f08090a0b,
                               0x0405060700010203, 0x0c0d0e0f08090a0b };

#elif defined(__AVX2__)

const __m256i V256_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
                               0x0001020304050607, 0x08090a0b0c0d0e0f };

const __m256i V256_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
                               0x0405060700010203, 0x0c0d0e0f08090a0b };

const __m256i V256_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09,
                                  0x0007060504030201, 0x080f0e0d0c0b0a09 };

const __m256i V256_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b,
                                   0x0201000706050403, 0x0a09080f0e0d0c0b };

const __m256i V256_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f,
                                  0x0605040302010007, 0x0e0d0c0b0a09080f };

const __m256i V256_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d,
                                   0x0403020100070605, 0x0c0b0a09080f0e0d };

const __m256i V256_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09,
                                  0x0407060500030201, 0x0c0f0e0d080b0a09 };

const __m256i V256_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b,
                                  0x0605040702010003, 0x0e0d0c0f0a09080b };

#elif defined(__SSSE3__)

const v128_t V128_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
const v128_t V128_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b };

const v128_t V128_SHUFLR64_8  = { 0x0007060504030201, 0x080f0e0d0c0b0a09 };
const v128_t V128_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b };
const v128_t V128_SHUFLL64_8  = { 0x0605040302010007, 0x0e0d0c0b0a09080f };
const v128_t V128_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d };

const v128_t V128_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09 };
const v128_t V128_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b };

#endif

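// Illustrative usage sketch, not part of this commit: these vectors are
// byte-permute masks; each mask byte names a source byte index, so with
// SSSE3 a 64-bit rotate right by 8 is a single shuffle. demo_* is
// hypothetical; the mask below has the same bytes as V128_SHUFLR64_8.
#include <immintrin.h>
static inline __m128i demo_ror64_8( const __m128i x )
{
   const __m128i m = _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 );
   return _mm_shuffle_epi8( x, m );   // ror64( x, 8 ) in each qword
}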
@@ -14,10 +14,10 @@
// veor3q( v2, v1, v0 )                 xor3        v2 ^ v1 ^ v0
// vxarq_u64( v1, v0, n )               ror64xor    ( ( v1 ^ v0 ) >>> n )
// vbcaxq_u{64,32,16,8}( v2, v1, v0 )   xorandnot   v2 ^ ( v1 & ~v0 )
// vsraq_n_u{64,32,16,8}( v1, v0, n )               v1 + ( v0 >> n )
//
// not used anywhere yet
// vrax1q_u64( v1, v0 )                             v1 ^ ( v0 <<< 1 )
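// Illustrative sketch, not part of this commit: vsraq_n is a fused
// shift-right-and-accumulate, one USRA instruction instead of a shift plus
// an add. The shift count must be an immediate. demo_* is hypothetical.
#include <arm_neon.h>
static inline uint64x2_t demo_addsr64( uint64x2_t v1, uint64x2_t v0 )
{ return vsraq_n_u64( v1, v0, 7 ); }   // v1 + ( v0 >> 7 )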

#define v128_t uint32x4_t      // default
#define v128u64_t uint64x2_t
@@ -124,7 +124,7 @@
// ~v1 & v0
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
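// Illustrative sketch, not part of this commit: NEON BIC computes a & ~b,
// so the operands are swapped above to match the x86_64 ANDNOT convention
// of ~v1 & v0. demo_* is hypothetical.
#include <arm_neon.h>
static inline uint32x4_t demo_andnot( uint32x4_t v1, uint32x4_t v0 )
{ return vbicq_u32( v0, v1 ); }   // v0 & ~v1 == ~v1 & v0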

// ~( a ^ b ), same as (~a) ^ b
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )

// ~v1 | v0, args reversed for consistency with x86_64
@@ -136,8 +136,11 @@
// known way to test arm minor version.
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32( v3, veor3q_u32( v2, v1, v0 ) )
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32( veorq_u32( v3, v2 ), \
                                               veorq_u32( v1, v0 ) )
#endif

// v2 & v1 & v0
@@ -153,13 +156,13 @@
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#endif

// a ^ ( b & c )
// v2 ^ ( v1 & v0 )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )

// a & ( b ^ c )
// v2 & ( v1 ^ v0 )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )

// a ^ ( b | c )
// v2 ^ ( v1 | v0 )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
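// Illustrative sketch, not part of this commit: these ternary helpers map
// directly onto hash primitives, e.g. the SHA-2 choice function
// Ch(x,y,z) = (x & y) ^ (~x & z) = z ^ ( x & ( y ^ z ) ), which is
// v128_xorand( z, x, v128_xor( y, z ) ). demo_* is hypothetical.
#include <arm_neon.h>
static inline uint32x4_t demo_ch( uint32x4_t x, uint32x4_t y, uint32x4_t z )
{ return veorq_u32( z, vandq_u32( x, veorq_u32( y, z ) ) ); }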

// v2 | ( v1 & v0 )
@@ -240,7 +243,7 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))

// set1
// set1, integer argument
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
@@ -326,10 +329,59 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
}

// how to build a bitmask from vector elements? Efficiently???
#define v128_movmask32
#define v128_movmask64
//#define v128_movmask32
//#define v128_movmask64

#define v128_shuffle8( v, vmask ) \
   vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
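// Illustrative sketch, not part of this commit: vqtbl1q_u8 is a full
// 16-byte table lookup, so any byte permutation works as a mask, e.g. a
// per-32-bit-word byte swap. demo_* is hypothetical.
#include <arm_neon.h>
static inline uint32x4_t demo_bswap32_tbl( uint32x4_t v )
{
   static const uint8_t m[16] = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
   return (uint32x4_t)vqtbl1q_u8( (uint8x16_t)v, vld1q_u8( m ) );
}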

// Bit rotation
/*
#define v128_shuflr64_8( v )  v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v )  v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16( v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16( v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24( v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24( v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v )  v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v )  v128_shuffle8( v, V128_SHUFLL32_8 )

#define v128_ror64( v, c ) \
   ( (c) ==  8 ) ? v128_shuflr64_8( v ) \
 : ( (c) == 16 ) ? v128_shuflr64_16( v ) \
 : ( (c) == 24 ) ? v128_shuflr64_24( v ) \
 : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
 : ( (c) == 40 ) ? v128_shufll64_24( v ) \
 : ( (c) == 48 ) ? v128_shufll64_16( v ) \
 : ( (c) == 56 ) ? v128_shufll64_8( v ) \
 : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
                ((uint64x2_t)(v)), c )

#define v128_rol64( v, c ) \
   ( (c) ==  8 ) ? v128_shufll64_8( v ) \
 : ( (c) == 16 ) ? v128_shufll64_16( v ) \
 : ( (c) == 24 ) ? v128_shufll64_24( v ) \
 : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
 : ( (c) == 40 ) ? v128_shuflr64_24( v ) \
 : ( (c) == 48 ) ? v128_shuflr64_16( v ) \
 : ( (c) == 56 ) ? v128_shuflr64_8( v ) \
 : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
                ((uint64x2_t)(v)), c )

#define v128_ror32( v, c ) \
   ( (c) ==  8 ) ? v128_shuflr32_8( v ) \
 : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
 : ( (c) == 24 ) ? v128_shufll32_8( v ) \
 : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
                ((uint32x4_t)(v)), c )

#define v128_rol32( v, c ) \
   ( (c) ==  8 ) ? v128_shufll32_8( v ) \
 : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
 : ( (c) == 24 ) ? v128_shuflr32_8( v ) \
 : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
                ((uint32x4_t)(v)), c )
*/

#define v128_ror64( v, c ) \
   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -351,6 +403,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
                ((uint32x4_t)(v)), c )
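// Illustrative sketch, not part of this commit: the special cases above
// exist because some rotate counts are single permute ops, e.g. rotating a
// 64-bit lane by 32 just swaps its halves, which REV64 does directly.
#include <arm_neon.h>
static inline uint64x2_t demo_ror64_32( uint64x2_t v )
{ return (uint64x2_t)vrev64q_u32( (uint32x4_t)v ); }   // ror64( v, 32 )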

/* not used
#define v128_ror16( v, c ) \
   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
 : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
@@ -368,6 +421,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_rol8( v, c ) \
   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
               ((uint8x16_t)(v)), c )
*/

// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
@@ -376,57 +430,13 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
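// Illustrative sketch, not part of this commit: with the SHA3 extension
// XAR fuses the xor-rotate used by Keccak; otherwise it falls back to an
// EOR plus a two-instruction rotate. demo_* is hypothetical.
#include <arm_neon.h>
static inline uint64x2_t demo_ror64xor_63( uint64x2_t v1, uint64x2_t v0 )
{
#if defined(__ARM_FEATURE_SHA3)
   return vxarq_u64( v1, v0, 63 );             // ( v1 ^ v0 ) >>> 63
#else
   uint64x2_t x = veorq_u64( v1, v0 );
   return vorrq_u64( vshrq_n_u64( x, 63 ), vshlq_n_u64( x, 1 ) );
#endif
}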

#define v128_2ror64( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshrq_n_u64( v0, c ); \
   uint64x2_t t1 = vshrq_n_u64( v1, c ); \
   v0 = vshlq_n_u64( v0, 64-(c) ); \
   v1 = vshlq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2rol64( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshlq_n_u64( v0, c ); \
   uint64x2_t t1 = vshlq_n_u64( v1, c ); \
   v0 = vshrq_n_u64( v0, 64-(c) ); \
   v1 = vshrq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2ror32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshrq_n_u32( v0, c ); \
   uint32x4_t t1 = vshrq_n_u32( v1, c ); \
   v0 = vshlq_n_u32( v0, 32-(c) ); \
   v1 = vshlq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}

#define v128_2rol32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshlq_n_u32( v0, c ); \
   uint32x4_t t1 = vshlq_n_u32( v1, c ); \
   v0 = vshrq_n_u32( v0, 32-(c) ); \
   v1 = vshrq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}
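// Illustrative usage sketch, not part of this commit: the paired macros
// rotate two vectors in place so their dependency chains can interleave.
// demo_* is hypothetical.
#include <arm_neon.h>
static inline void demo_2ror64( uint64x2_t *a, uint64x2_t *b )
{
   uint64x2_t v1 = *a, v0 = *b;
   v128_2ror64( v1, v0, 14 );   // v1 = v1 >>> 14, v0 = v0 >>> 14
   *a = v1;  *b = v0;
}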

/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
   v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] )
*/

/* not used
// v1 + ( v0 >> c )
#define v128_addsr64( v1, v0, c ) vsraq_n_u64( v1, v0, c )
#define v128_addsr32( v1, v0, c ) vsraq_n_u32( v1, v0, c )
*/

#define v128_shuffle8( v, vmask ) \
   vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Cross lane shuffle

// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -438,19 +448,14 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_qrev16 vrev64q_u16
#define v128_lrev16 vrev32q_u16

// aka bswap
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8

// full vector rotation

// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{ return vextq_u64( v, v, 1 ); }
#define v128_swap64 v128_rev64 // grandfathered

#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )

// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -468,7 +473,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap64(v)  (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )

// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
   casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -482,26 +486,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
}
#define v128_block_bswap32_256 v128_block_bswap32

#define v128_block_bswap32_512( dst, src ) \
{ \
   casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
   casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
   casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
   casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
   casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
   casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
   casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
   casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
   casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
   casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
   casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
   casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
   casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
   casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
   casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
   casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}

#define v128_block_bswap64( dst, src ) \
{ \
   casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
@@ -513,27 +497,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
   casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
   casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
#define v128_block_bswap64_512 v128_block_bswap64

#define v128_block_bswap64_1024( dst, src ) \
{ \
   casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
   casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
   casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
   casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
   casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
   casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
   casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
   casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
   casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
   casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
   casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
   casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
   casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
   casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
   casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
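// Illustrative sketch, not part of this commit: NEON BSL blends per bit,
// but restricting masks to whole bytes keeps behaviour identical to the
// x86_64 byte-blend path. demo_* is hypothetical.
#include <arm_neon.h>
static inline uint32x4_t demo_blend( uint32x4_t mask, uint32x4_t a, uint32x4_t b )
{ return vbslq_u32( mask, a, b ); }   // ( mask & a ) | ( ~mask & b )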