mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.22.3
This commit is contained in:
@@ -410,7 +410,8 @@ static inline void extr_lane_4x32( void *d, const void *s,
|
||||
|
||||
static inline void mm128_bswap32_80( void *d, void *s )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf );
|
||||
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf );
|
||||
@@ -456,7 +457,8 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -731,7 +733,12 @@ static inline void extr_lane_8x32( void *d, const void *s,
|
||||
|
||||
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i c1 = _mm256_set1_epi32( 1 );
|
||||
const __m256i c2 = _mm256_add_epi32( c1, c1 );
|
||||
const __m256i c3 = _mm256_add_epi32( c2, c1 );
|
||||
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
@@ -744,52 +751,46 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||
|
||||
casti_m128i( d, 0 ) =
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 );
|
||||
casti_m128i( d, 2 ) =
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0 , 0x55 );
|
||||
casti_m128i( d, 4 ) =
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s0 , 0xaa );
|
||||
casti_m128i( d, 6 ) =
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0 , 0xff );
|
||||
casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 );
|
||||
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s0 ), c1 );
|
||||
casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s0 ), c2 );
|
||||
casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s0 ), c3 );
|
||||
|
||||
casti_m128i( d, 8 ) =
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s1 , 0x00 );
|
||||
casti_m128i( d,10 ) =
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi32( s1 , 0x55 );
|
||||
casti_m128i( d,12 ) =
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi32( s1 , 0xaa );
|
||||
casti_m128i( d,14 ) =
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi32( s1 , 0xff );
|
||||
casti_m256i( d, 4 ) = _mm256_broadcastd_epi32( s1 );
|
||||
casti_m256i( d, 5 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s1 ), c1 );
|
||||
casti_m256i( d, 6 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s1 ), c2 );
|
||||
casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s1 ), c3 );
|
||||
|
||||
casti_m128i( d,16 ) =
|
||||
casti_m128i( d,17 ) = _mm_shuffle_epi32( s2 , 0x00 );
|
||||
casti_m128i( d,18 ) =
|
||||
casti_m128i( d,19 ) = _mm_shuffle_epi32( s2 , 0x55 );
|
||||
casti_m128i( d,20 ) =
|
||||
casti_m128i( d,21 ) = _mm_shuffle_epi32( s2 , 0xaa );
|
||||
casti_m128i( d,22 ) =
|
||||
casti_m128i( d,23 ) = _mm_shuffle_epi32( s2 , 0xff );
|
||||
casti_m256i( d, 8 ) = _mm256_broadcastd_epi32( s2 );
|
||||
casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), c1 );
|
||||
casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), c2 );
|
||||
casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), c3 );
|
||||
|
||||
casti_m128i( d,24 ) =
|
||||
casti_m128i( d,25 ) = _mm_shuffle_epi32( s3 , 0x00 );
|
||||
casti_m128i( d,26 ) =
|
||||
casti_m128i( d,27 ) = _mm_shuffle_epi32( s3 , 0x55 );
|
||||
casti_m128i( d,28 ) =
|
||||
casti_m128i( d,29 ) = _mm_shuffle_epi32( s3 , 0xaa );
|
||||
casti_m128i( d,30 ) =
|
||||
casti_m128i( d,31 ) = _mm_shuffle_epi32( s3 , 0xff );
|
||||
|
||||
casti_m128i( d,32 ) =
|
||||
casti_m128i( d,33 ) = _mm_shuffle_epi32( s4 , 0x00 );
|
||||
casti_m128i( d,34 ) =
|
||||
casti_m128i( d,35 ) = _mm_shuffle_epi32( s4 , 0x55 );
|
||||
casti_m128i( d,36 ) =
|
||||
casti_m128i( d,37 ) = _mm_shuffle_epi32( s4 , 0xaa );
|
||||
casti_m128i( d,38 ) =
|
||||
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
|
||||
}
|
||||
casti_m256i( d,12 ) = _mm256_broadcastd_epi32( s3 );
|
||||
casti_m256i( d,13 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s3 ), c1 );
|
||||
casti_m256i( d,14 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s3 ), c2 );
|
||||
casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s3 ), c3 );
|
||||
|
||||
casti_m256i( d,16 ) = _mm256_broadcastd_epi32( s4 );
|
||||
casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s4 ), c1 );
|
||||
casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s4 ), c2 );
|
||||
casti_m256i( d,19 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s4 ), c3 );
|
||||
}
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -1174,9 +1175,72 @@ static inline void extr_lane_16x32( void *d, const void *s,
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
|
||||
#if defined(__AVX512VBMI__)
|
||||
|
||||
// Combine byte swap & broadcast in one permute
|
||||
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i c0 = _mm512_set1_epi32( 0x00010203 );
|
||||
const __m512i c1 = _mm512_set1_epi32( 0x04050607 );
|
||||
const __m512i c2 = _mm512_set1_epi32( 0x08090a0b );
|
||||
const __m512i c3 = _mm512_set1_epi32( 0x0c0d0e0f );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
const __m128i s3 = casti_m128i( src,3 );
|
||||
const __m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 1 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 2 ) = _mm512_permutexvar_epi8( c2,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 3 ) = _mm512_permutexvar_epi8( c3,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 4 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 5 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 6 ) = _mm512_permutexvar_epi8( c2,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 7 ) = _mm512_permutexvar_epi8( c3,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 8 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d, 9 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,10 ) = _mm512_permutexvar_epi8( c2,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,11 ) = _mm512_permutexvar_epi8( c3,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,12 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,13 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,14 ) = _mm512_permutexvar_epi8( c2,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,15 ) = _mm512_permutexvar_epi8( c3,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,16 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,17 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,18 ) = _mm512_permutexvar_epi8( c2,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,19 ) = _mm512_permutexvar_epi8( c3,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
{
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi32( 1 );
|
||||
const __m512i c2 = _mm512_add_epi32( c1, c1 );
|
||||
const __m512i c3 = _mm512_add_epi32( c2, c1 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
@@ -1189,33 +1253,48 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||
|
||||
casti_m512i( d, 0 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0x00 ) );
|
||||
casti_m512i( d, 1 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0x55 ) );
|
||||
casti_m512i( d, 2 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0xaa ) );
|
||||
casti_m512i( d, 3 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0xff ) );
|
||||
casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 );
|
||||
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( c2,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( c3,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
|
||||
casti_m512i( d, 4 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0x00 ) );
|
||||
casti_m512i( d, 5 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0x55 ) );
|
||||
casti_m512i( d, 6 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0xaa ) );
|
||||
casti_m512i( d, 7 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0xff ) );
|
||||
casti_m512i( d, 4 ) = _mm512_broadcastd_epi32( s1 );
|
||||
casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( c1,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( c2,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( c3,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
|
||||
casti_m512i( d, 8 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0x00 ) );
|
||||
casti_m512i( d, 9 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0x55 ) );
|
||||
casti_m512i( d,10 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0xaa ) );
|
||||
casti_m512i( d,11 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0xff ) );
|
||||
casti_m512i( d, 8 ) = _mm512_broadcastd_epi32( s2 );
|
||||
casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( c1,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,10 ) = _mm512_permutexvar_epi32( c2,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,11 ) = _mm512_permutexvar_epi32( c3,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
|
||||
casti_m512i( d,12 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0x00 ) );
|
||||
casti_m512i( d,13 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0x55 ) );
|
||||
casti_m512i( d,14 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0xaa ) );
|
||||
casti_m512i( d,15 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0xff ) );
|
||||
casti_m512i( d,12 ) = _mm512_broadcastd_epi32( s3 );
|
||||
casti_m512i( d,13 ) = _mm512_permutexvar_epi32( c1,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,14 ) = _mm512_permutexvar_epi32( c2,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,15 ) = _mm512_permutexvar_epi32( c3,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
|
||||
casti_m512i( d,16 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0x00 ) );
|
||||
casti_m512i( d,17 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0x55 ) );
|
||||
casti_m512i( d,18 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0xaa ) );
|
||||
casti_m512i( d,19 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0xff ) );
|
||||
casti_m512i( d,16 ) = _mm512_broadcastd_epi32( s4 );
|
||||
casti_m512i( d,17 ) = _mm512_permutexvar_epi32( c1,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,18 ) = _mm512_permutexvar_epi32( c2,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,19 ) = _mm512_permutexvar_epi32( c3,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
}
|
||||
|
||||
|
||||
#endif // VBMI else
|
||||
#endif // AVX512
|
||||
|
||||
///////////////////////////
|
||||
@@ -1393,82 +1472,56 @@ static inline void extr_lane_4x64( void *dst, const void *src, const int lane,
|
||||
return; // bit_len == 512
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static inline void mm256_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
__m128i s3 = casti_m128i( src,3 );
|
||||
__m256i s0 = casti_m256i( src,0 );
|
||||
__m256i s1 = casti_m256i( src,1 );
|
||||
__m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
casti_m128i( d, 0 ) =
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_m128i( d, 2 ) =
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
|
||||
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
|
||||
casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
|
||||
casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
|
||||
|
||||
casti_m128i( d, 4 ) =
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_m128i( d, 6 ) =
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
|
||||
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
|
||||
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
|
||||
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
|
||||
|
||||
casti_m128i( d, 8 ) =
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_m128i( d, 10 ) =
|
||||
casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
|
||||
casti_m128i( d, 12 ) =
|
||||
casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_m128i( d, 14 ) =
|
||||
casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
|
||||
casti_m128i( d, 16 ) =
|
||||
casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_m128i( d, 18 ) =
|
||||
casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
|
||||
_mm256_castsi128_si256( s4 ), 0x00 );
|
||||
casti_m256i( d, 9 ) = _mm256_permute4x64_epi64(
|
||||
_mm256_castsi128_si256( s4 ), 0x55 );
|
||||
}
|
||||
|
||||
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
__m128i s3 = casti_m128i( src,3 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128(
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
__m256i s0 = casti_m256i( src,0 );
|
||||
__m256i s1 = casti_m256i( src,1 );
|
||||
__m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
|
||||
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) );
|
||||
|
||||
casti_m128i( d, 0 ) =
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_m128i( d, 2 ) =
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
|
||||
casti_m128i( d, 4 ) =
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_m128i( d, 6 ) =
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
|
||||
casti_m128i( d, 8 ) =
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_m128i( d, 10 ) =
|
||||
casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
|
||||
casti_m128i( d, 12 ) =
|
||||
casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_m128i( d, 14 ) =
|
||||
casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
|
||||
casti_m128i( d, 16 ) =
|
||||
casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_m128i( d, 18 ) =
|
||||
casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
|
||||
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
|
||||
casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
|
||||
casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
|
||||
|
||||
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
|
||||
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
|
||||
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
|
||||
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
|
||||
|
||||
casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
|
||||
_mm256_castsi128_si256( s4 ), 0x00 );
|
||||
casti_m256i( d, 9 ) = _mm256_permute4x64_epi64(
|
||||
_mm256_castsi128_si256( s4 ), 0x55 );
|
||||
}
|
||||
|
||||
#endif // AVX2
|
||||
@@ -1796,25 +1849,65 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
|
||||
// broadcast to all lanes
|
||||
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
|
||||
{
|
||||
__m512i *d = (__m512i*)dst;
|
||||
const __m128i *s = (const __m128i*)src;
|
||||
__m512i *d = (__m512i*)dst;
|
||||
const uint64_t *s = (const uint64_t*)src;
|
||||
|
||||
d[ 0] = mm512_bcast_m128( _mm_shuffle_epi32( s[0], 0x44 ) );
|
||||
d[ 1] = mm512_bcast_m128( _mm_shuffle_epi32( s[0], 0xee ) );
|
||||
d[ 2] = mm512_bcast_m128( _mm_shuffle_epi32( s[1], 0x44 ) );
|
||||
d[ 3] = mm512_bcast_m128( _mm_shuffle_epi32( s[1], 0xee ) );
|
||||
d[ 4] = mm512_bcast_m128( _mm_shuffle_epi32( s[2], 0x44 ) );
|
||||
d[ 5] = mm512_bcast_m128( _mm_shuffle_epi32( s[2], 0xee ) );
|
||||
d[ 6] = mm512_bcast_m128( _mm_shuffle_epi32( s[3], 0x44 ) );
|
||||
d[ 7] = mm512_bcast_m128( _mm_shuffle_epi32( s[3], 0xee ) );
|
||||
d[ 8] = mm512_bcast_m128( _mm_shuffle_epi32( s[4], 0x44 ) );
|
||||
d[ 9] = mm512_bcast_m128( _mm_shuffle_epi32( s[4], 0xee ) );
|
||||
d[0] = _mm512_set1_epi64( s[0] );
|
||||
d[1] = _mm512_set1_epi64( s[1] );
|
||||
d[2] = _mm512_set1_epi64( s[2] );
|
||||
d[3] = _mm512_set1_epi64( s[3] );
|
||||
d[4] = _mm512_set1_epi64( s[4] );
|
||||
d[5] = _mm512_set1_epi64( s[5] );
|
||||
d[6] = _mm512_set1_epi64( s[6] );
|
||||
d[7] = _mm512_set1_epi64( s[7] );
|
||||
d[8] = _mm512_set1_epi64( s[8] );
|
||||
d[9] = _mm512_set1_epi64( s[9] );
|
||||
}
|
||||
|
||||
// byte swap and broadcast to al lanes
|
||||
// byte swap and broadcast to all lanes
|
||||
|
||||
#if defined(__AVX512VBMI__)
|
||||
|
||||
// Combine byte swap & broadcast in one permute
|
||||
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i c0 = _mm512_set1_epi64( 0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi64( 0x0c0d0e0f08090a0b );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
const __m128i s3 = casti_m128i( src,3 );
|
||||
const __m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,5 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,6 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,7 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,8 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
casti_m512i( d,9 ) = _mm512_permutexvar_epi8( c1,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
{
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi64( 1 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
@@ -1827,18 +1920,24 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||
|
||||
casti_m512i( d, 0 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0, 0x44 ) );
|
||||
casti_m512i( d, 1 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0, 0xee ) );
|
||||
casti_m512i( d, 2 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1, 0x44 ) );
|
||||
casti_m512i( d, 3 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1, 0xee ) );
|
||||
casti_m512i( d, 4 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2, 0x44 ) );
|
||||
casti_m512i( d, 5 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2, 0xee ) );
|
||||
casti_m512i( d, 6 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3, 0x44 ) );
|
||||
casti_m512i( d, 7 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3, 0xee ) );
|
||||
casti_m512i( d, 8 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4, 0x44 ) );
|
||||
casti_m512i( d, 9 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4, 0xee ) );
|
||||
casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 );
|
||||
casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
casti_m512i( d,2 ) = _mm512_broadcastq_epi64( s1 );
|
||||
casti_m512i( d,3 ) = _mm512_permutexvar_epi64( c1,
|
||||
_mm512_castsi128_si512( s1 ) );
|
||||
casti_m512i( d,4 ) = _mm512_broadcastq_epi64( s2 );
|
||||
casti_m512i( d,5 ) = _mm512_permutexvar_epi64( c1,
|
||||
_mm512_castsi128_si512( s2 ) );
|
||||
casti_m512i( d,6 ) = _mm512_broadcastq_epi64( s3 );
|
||||
casti_m512i( d,7 ) = _mm512_permutexvar_epi64( c1,
|
||||
_mm512_castsi128_si512( s3 ) );
|
||||
casti_m512i( d,8 ) = _mm512_broadcastq_epi64( s4 );
|
||||
casti_m512i( d,9 ) = _mm512_permutexvar_epi64( c1,
|
||||
_mm512_castsi128_si512( s4 ) );
|
||||
}
|
||||
|
||||
#endif // VBMI else
|
||||
#endif // AVX512
|
||||
|
||||
//////////////////////////
|
||||
@@ -1995,7 +2094,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
|
||||
|
||||
static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
|
||||
Reference in New Issue
Block a user