Jay D Dee
2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions


@@ -589,20 +589,7 @@ static inline void extr_lane_4x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ];
}
#if defined(__SSSE3__)
static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#if defined(__SSSE3__) || defined(__ARM_NEON)
static inline void v128_bswap32_80( void *d, void *s )
{
@@ -641,6 +628,8 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128u32_t s0 = casti_v128u32( src,0 );
@@ -649,27 +638,12 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
@@ -696,6 +670,8 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
}
#endif // SSE2 || NEON
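// Illustrative usage with hypothetical caller buffers: byte-swap an 80 byte
// block header and interleave it for 4-way 32 bit hashing, one lane per nonce.
//    uint32_t vdata[20*4] __attribute__((aligned(16)));   // 20 words x 4 lanes
//    v128_bswap32_intrlv80_4x32( vdata, pdata );          // pdata: 80 byte header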
// 8x32
#if defined(__AVX2__)
@@ -1112,8 +1088,6 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i c1 = v256_32( 1 );
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1124,11 +1098,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 );
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32(
@@ -1617,8 +1591,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
@@ -1628,11 +1600,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 );
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1,
@@ -1878,6 +1850,8 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128u64_t s0 = casti_v128u64( src,0 );
@@ -1886,27 +1860,12 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
const v128u64_t bswap_shuf = v128_set64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
@@ -1923,6 +1882,8 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
#endif // SSE2 || NEON
static inline void extr_lane_2x64( void *dst, const void *src,
const int lane, const int bit_len )
{
@@ -2233,25 +2194,23 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m256i s0 = casti_m256i( src,0 ); // s0, s1
__m256i s2 = casti_m256i( src,1 ); // s2, s3
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) );
s0 = mm256_bswap_32( s0 );
s2 = mm256_bswap_32( s2 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s2, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s2, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s2, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s2, 0xff );
casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
_mm256_castsi128_si256( s4 ), 0x00 );
@@ -2648,8 +2607,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
@@ -2657,11 +2614,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 );
casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1,
@@ -2842,49 +2799,45 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ),
bswap_shuf );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ),
bswap_shuf );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ),
bswap_shuf );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ),
bswap_shuf );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s0 ) );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s1 ) );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s2 ) );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s3 ) );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s4 ) );
}
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
casti_m512i( d,0 ) = mm512_bcast128( s0 );
casti_m512i( d,1 ) = mm512_bcast128( s1 );
casti_m512i( d,2 ) = mm512_bcast128( s2 );
casti_m512i( d,3 ) = mm512_bcast128( s3 );
casti_m512i( d,4 ) = mm512_bcast128( s4 );
}
#endif // AVX512VBMI ELSE


@@ -521,29 +521,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define v128_shufll64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define v128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define v128_shufll64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
#define v128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#define v128_shufll32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define v128_shuflr64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
@@ -612,74 +595,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
#if defined(VL256)
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
v0 = _mm_slli_epi64( v0, 64-(c) ); \
v1 = _mm_slli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
v0 = _mm_srli_epi64( v0, 64-(c) ); \
v1 = _mm_srli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
v0 = _mm_slli_epi32( v0, 32-(c) ); \
v1 = _mm_slli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
v0 = _mm_srli_epi32( v0, 32-(c) ); \
v1 = _mm_srli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#endif // AVX512 else SSE2
*/
// Cross lane shuffles
// No NEON version
@@ -721,13 +636,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define v128_bswap64( v ) _mm_shuffle_epi8( v, V128_BSWAP64 )
#define v128_bswap32( v ) _mm_shuffle_epi8( v, V128_BSWAP32 )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
@@ -735,85 +647,30 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
casti_v128( d,0 ) = v128_bswap64( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap64( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap64( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap64( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap64( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap64( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap64( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap64( casti_v128( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
casti_v128( d,0 ) = v128_bswap32( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap32( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap32( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap32( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap32( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap32( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap32( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap32( casti_v128( s,7 ) ); \
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline v128_t v128_bswap64( __m128i v )
@@ -835,7 +692,7 @@ static inline v128_t v128_bswap16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_rev64( v128_bswap64( v ) )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
@@ -849,26 +706,6 @@ static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
d[7] = v128_bswap64( s[7] );
}
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[12] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
@@ -882,26 +719,6 @@ static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
}
#define v128_block_bswap32_256 v128_block_bswap32
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
// alignr instruction for 32 & 64 bit elements is only available with AVX512


@@ -61,8 +61,10 @@ typedef union
#if defined(__AVX2__)
// Broadcast, ie set1, from 128 bit vector input.
#define mm256_bcast_m128( v ) \
#define mm256_bcast128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// deprecated
#define mm256_bcast_m128 mm256_bcast128
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -73,23 +75,23 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set2_64( i1, i0 ) mm256_bcast128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
mm256_bcast128( _mm_set_epi32( i3, i2, i1, i0 ) )
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( v128_one )
#define m256_one_128 mm256_bcast128( v128_one )
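// Illustration of the constraint above (hypothetical names): the macro expands
// to intrinsics executed at run time, so it can't initialize a static object,
// while a brace-initialized vector literal can.
//    static const __m256i k = m256_one_128;     // error: not a constant expression
//    static const __m256i k = { 1, 0, 1, 0 };   // OK: literal 64 bit lanes
//    __m256i v = m256_one_128;                  // OK inside a function, at run time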
static inline __m256i mm256_neg1_fn()
{
@@ -231,21 +233,8 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )
#define mm256_qrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )
#define mm256_lrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_wrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
@@ -268,50 +257,33 @@ static inline __m256i mm256_not( const __m256i v )
#if defined(VL256)
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
// Redundant but naming may be a better fit in some applications.
#define mm126_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm156_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#else
// ROR & ROL will always find the fastest but these names may be a better fit
// in some applications.
#define mm256_shuflr64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
#define mm256_shufll64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
#define mm256_shufll64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
#define mm256_shufll32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
#define mm256_shuflr64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_8 )
#define mm256_shufll64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_8 )
#define mm256_shuflr64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_24 )
#define mm256_shufll64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_24 )
#define mm256_shuflr32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR32_8 )
#define mm256_shufll32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL32_8 )
#define mm256_ror_64( v, c ) \
( (c) == 8 ) ? mm256_shuflr64_8( v ) \
@@ -347,96 +319,6 @@ static inline __m256i mm256_not( const __m256i v )
#endif
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.
#if defined(VL256)
/*
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
*/
#define mm256_rorx2_64( v1, v0, c ) \
_mm256_ror_epi64( v0, c ); \
_mm256_ror_epi64( v1, c )
#define mm256_rolx2_64( v1, v0, c ) \
_mm256_rol_epi64( v0, c ); \
_mm256_rol_epi64( v1, c )
#define mm256_rorx2_32( v1, v0, c ) \
_mm256_ror_epi32( v0, c ); \
_mm256_ror_epi32( v1, c )
#define mm256_rolx2_32( v1, v0, c ) \
_mm256_rol_epi32( v0, c ); \
_mm256_rol_epi32( v1, c )
#else // AVX2
/*
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
*/
#define mm256_rorx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi64( v0, c ); \
__m256i t1 = _mm256_srli_epi64( v1, c ); \
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi64( v0, c ); \
__m256i t1 = _mm256_slli_epi64( v1, c ); \
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rorx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi32( v0, c ); \
__m256i t1 = _mm256_srli_epi32( v1, c ); \
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( v0, c ); \
__m256i t1 = _mm256_slli_epi32( v1, c ); \
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#endif // AVX512 else AVX2
#if defined(__AVX2__)
// 128 bit version of unpack
@@ -453,20 +335,14 @@ static inline __m256i mm256_not( const __m256i v )
//
// Cross lane shuffles
//
// Rotate elements across all lanes.
#define mm256_shuffle_16( v, c ) \
_mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \
_mm256_shufflelo_epi16( v, c ) )
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_rev_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_swap_128 mm256_rev_128 // grandfathered
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Reverse 64 bit elements
/* not used
// Reverse elements
#define mm256_rev_64( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_rev_32( v ) \
@@ -474,7 +350,12 @@ static inline __m256i mm256_not( const __m256i v )
0x0000000400000005, 0x0000000600000007 )
#define mm256_rev_16( v ) \
_mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e )
_mm256_permute4x64_epi64( mm256_shuffle16( v, 0x1b ), 0x4e )
*/
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
@@ -486,7 +367,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
_mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \
_mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -507,113 +388,64 @@ static inline __m256i mm256_shufll_32( const __m256i v )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
#define mm256_swap128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_rev128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_swap128_64 mm256_rev128_64 // grandfathered
/*not used
#define mm256_rev128_32(v) _mm256_shuffle_epi32( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle_16( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle16( v, 0x1b )
*/
#define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 )
/* not used
#define mm256_shuflr128_16(v) mm256_shuffle16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle16( v, 0x93 )
/* Not used
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_bswap_32( v ) _mm256_shuffle_epi8( v, V256_BSWAP32 )
/* not used
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
_mm256_shuffle_epi8( v, mm256_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
*/
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
}
#define mm256_block_bswap64_512 mm256_block_bswap_64
#define mm256_block_bswap64_1024( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
casti_m256i( d,0 ) = mm256_bswap_64( casti_m256i( s,0 ) ); \
casti_m256i( d,1 ) = mm256_bswap_64( casti_m256i( s,1 ) ); \
casti_m256i( d,2 ) = mm256_bswap_64( casti_m256i( s,2 ) ); \
casti_m256i( d,3 ) = mm256_bswap_64( casti_m256i( s,3 ) ); \
casti_m256i( d,4 ) = mm256_bswap_64( casti_m256i( s,4 ) ); \
casti_m256i( d,5 ) = mm256_bswap_64( casti_m256i( s,5 ) ); \
casti_m256i( d,6 ) = mm256_bswap_64( casti_m256i( s,6 ) ); \
casti_m256i( d,7 ) = mm256_bswap_64( casti_m256i( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 0 ) = mm256_bswap_32( casti_m256i( s, 0 ) ); \
casti_m256i( d, 1 ) = mm256_bswap_32( casti_m256i( s, 1 ) ); \
casti_m256i( d, 2 ) = mm256_bswap_32( casti_m256i( s, 2 ) ); \
casti_m256i( d, 3 ) = mm256_bswap_32( casti_m256i( s, 3 ) ); \
casti_m256i( d, 4 ) = mm256_bswap_32( casti_m256i( s, 4 ) ); \
casti_m256i( d, 5 ) = mm256_bswap_32( casti_m256i( s, 5 ) ); \
casti_m256i( d, 6 ) = mm256_bswap_32( casti_m256i( s, 6 ) ); \
casti_m256i( d, 7 ) = mm256_bswap_32( casti_m256i( s, 7 ) ); \
}
#define mm256_block_bswap32_256 mm256_block_bswap_32
#define mm256_block_bswap32_512( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
}
#if defined(VL256)
#define mm256_alignr64 _mm256_alignr_epi64


@@ -108,11 +108,13 @@ typedef union
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
static inline __m512i mm512_perm_128( const __m512i v, const int c )
static inline __m512i mm512_perm128( const __m512i v, const int c )
{ return _mm512_shuffle_i64x2( v, v, c ); }
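// Illustration of the expression pitfall (macro form shown only for comparison):
//    #define mm512_perm128_macro( v, c ) _mm512_shuffle_i64x2( v, v, c )
//    mm512_perm128_macro( *p++, 0x4e );   // v pasted twice: p advances twice
//    mm512_perm128( *p++, 0x4e );         // function: argument evaluated once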
// Broadcast 128 bit vector to all lanes of 512 bit vector.
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
#define mm512_bcast128( v ) mm512_perm128( _mm512_castsi128_si512( v ), 0 )
// deprecated
#define mm512_bcast_m128 mm512_bcast128
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -120,7 +122,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_bcast128hi_64( i64 ) _mm512_maskz_set1_epi64( 0xaa, i64 )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
mm512_bcast128( _mm_set_epi64x( i1, i0 ) )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
@@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Reverse byte order of packed elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm512_bswap_64( v ) _mm512_shuffle_epi8( v, V512_BSWAP64 )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
*/
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
// Source and destination are pointers, may point to same memory.
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
}
#define mm512_block_bswap64_512 mm512_block_bswap_64
#define mm512_block_bswap64_1024( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \
}
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \
}
#define mm512_block_bswap32_256 mm512_block_bswap_32
#define mm512_block_bswap32_512( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
}
// Cross-lane shuffles implementing rotation of packed elements.
//
// shuffle 16 bit elements within 64 bit lanes.
#define mm512_shuffle16( v, c ) \
_mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c )
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
static inline __m512i mm512_rev_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256 mm512_swap_256
#define mm512_shufll_256 mm512_swap_256
#define mm512_swap_256 mm512_rev_256 // grandfathered
static inline __m512i mm512_shuflr_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 2 ); }
@@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// Rotate elements within 256 bit lanes of 512 bit vector.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
#define mm512_rev256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128 mm512_rev256_128 // grandfathered
// Rotate 256 bit lanes by one 64 bit element
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
@@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
#define mm512_rev128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_swap128_64 mm512_rev128_64 // grandfathered
/*not used
#define mm512_rev128_32(v) _mm512_shuffle_epi32( v, 0x1b )
#define mm512_rev128_16(v) mm512_shuffle16( v, 0x1b )
*/
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
/* Not used
#define mm512_shuflr128_16(v) mm512_shuffle16( v, 0x39 )
#define mm512_shufll128_16(v) mm512_shuffle16( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
@@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// Not really necessary with AVX512, included for consistency with AVX2/SSE.
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
/* Not used
// 32 bit lanes
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_lrev16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )

View File

@@ -0,0 +1,55 @@
#include "simd-utils.h"
#if defined(SIMD512)
const __m512i V512_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m512i V512_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
#elif defined(__AVX2__)
const __m256i V256_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m256i V256_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
const __m256i V256_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09,
0x0007060504030201, 0x080f0e0d0c0b0a09 };
const __m256i V256_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b,
0x0201000706050403, 0x0a09080f0e0d0c0b };
const __m256i V256_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f,
0x0605040302010007, 0x0e0d0c0b0a09080f };
const __m256i V256_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d,
0x0403020100070605, 0x0c0b0a09080f0e0d };
const __m256i V256_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09,
0x0407060500030201, 0x0c0f0e0d080b0a09 };
const __m256i V256_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b,
0x0605040702010003, 0x0e0d0c0f0a09080b };
#elif defined(__SSSE3__)
const v128_t V128_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
const v128_t V128_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
const v128_t V128_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09 };
const v128_t V128_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b };
const v128_t V128_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f };
const v128_t V128_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d };
const v128_t V128_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09 };
const v128_t V128_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b };
#endif
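// Illustrative use of the new global masks, assuming they are also declared
// (extern) where the shuffle macros are defined: the constant is materialized
// once instead of being rebuilt with _mm_set_epi64x at every call site.
//    extern const __m256i V256_BSWAP32;
//    static inline __m256i bswap32_256_example( const __m256i v )
//    {  return _mm256_shuffle_epi8( v, V256_BSWAP32 );  }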


@@ -14,10 +14,10 @@
// veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0
// vxarq_u64( v1, v0, n ) ror64xor ( v1 ^ v0 ) >>> n
// vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
//
// not used anywhere yet
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
@@ -124,7 +124,7 @@
// ~v1 & v0
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( a ^ b ), same as (~a) ^ b
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
@@ -136,8 +136,11 @@
// known way to test arm minor version.
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32( v3, veor3q_u32( v2, v1, v0 ) )
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32 ( veorq_u32( v3, v2 ), \
veorq_u32( v1, v0 ) )
#endif
// v2 & v1 & v0
@@ -153,13 +156,13 @@
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#endif
// a ^ ( b & c )
// v2 ^ ( v1 & v0 )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
// v2 & ( v1 ^ v0 )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
// v2 ^ ( v1 | v0 )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
@@ -240,7 +243,7 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
// set1
// set1, integer argument
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
@@ -326,10 +329,59 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
}
// how to build a bitmask from vector elements? Efficiently???
#define v128_movmask32
#define v128_movmask64
//#define v128_movmask32
//#define v128_movmask64
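// One common AArch64 approach, sketched here with a hypothetical name: keep
// each lane's sign bit, shift it to a lane-indexed position, add across lanes.
static inline unsigned v128_movmask32_sketch( uint32x4_t v )
{
   const uint32x4_t bit = vshrq_n_u32( v, 31 );   // isolate sign bit: 0 or 1
   const int32x4_t  pos = { 0, 1, 2, 3 };         // per lane left shift amount
   return vaddvq_u32( vshlq_u32( bit, pos ) );    // horizontal add -> 4 bit mask
}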
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -351,6 +403,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
/* not used
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
@@ -368,6 +421,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
*/
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
@@ -376,57 +430,13 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
{ \
uint64x2_t t0 = vshrq_n_u64( v0, c ); \
uint64x2_t t1 = vshrq_n_u64( v1, c ); \
v0 = vsliq_n_u64( v0, 64-(c) ); \
v1 = vsliq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rol64_( v1, v0, c ) \
{ \
uint64x2_t t0 = vshlq_n_u64( v0, c ); \
uint64x2_t t1 = vshlq_n_u64( v1, c ); \
v0 = vsriq_n_u64( v0, 64-(c) ); \
v1 = vsriq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rorl32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshrq_n_u32( v0, c ); \
uint32x4_t t1 = vshrq_n_u32( v1, c ); \
v0 = vsliq_n_u32( v0, 32-(c) ); \
v1 = vsliq_n_u32( v1, 32-(c) ); \
v0 = vorrq_32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
v0 = vsriq_n_u32( v0, 32-(c) ); \
v1 = vsriq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
/* not used
// v1 + ( v0 >> c )
#define v128_addsr64( v1, v0, c ) vsraq_n_u64( v1, v0, c )
#define v128_addsr32( v1, v0, c ) vsraq_n_u32( v1, v0, c )
*/
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Cross lane shuffle
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
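// Example of that mirroring (illustrative): rotating a 64 bit lane by 32 bits
// degenerates into an element shuffle, which is why v128_ror64( v, 32 ) maps
// to vrev64q_u32 instead of a shift/insert pair.
//    uint64x2_t x = { 0x1122334455667788, 0 };
//    v128_ror64( x, 32 );   // == { 0x5566778811223344, 0 } == vrev64q_u32( x )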
@@ -438,19 +448,14 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_qrev16 vrev64q_u16
#define v128_lrev16 vrev32q_u16
// aka bswap
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{ return vextq_u64( v, v, 1 ); }
#define v128_swap64 v128_rev64 // grandfathered
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -468,7 +473,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -482,26 +486,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}
#define v128_block_bswap64( dst, src ) \
{ \
casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
@@ -513,27 +497,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( dst, src ) \
{ \
casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
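// Why bytewise only (illustrative): NEON bit-select blends per bit while
// x86_64 blendv selects whole bytes from each mask byte's sign bit, so results
// match only when every mask byte is 0x00 or 0xff.
//    NEON:   vbslq_u32( (uint32x4_t)mask, v1, v0 )   // per bit select
//    x86_64: _mm_blendv_epi8( v0, v1, mask )         // per byte, sign bit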