Mirror of https://github.com/JayDDee/cpuminer-opt.git
Commit v3.22.2
@@ -89,18 +89,18 @@ static inline void extr_lane_2x32( void *dst, const void *src,

#if defined(__SSE4_1__)

#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
D0 = mm128_shuflmov_32( S0, 1, S1, 0 ); \
D1 = mm128_shuflmov_32( S1, 0, S0, 1 ); \
D2 = mm128_shuflmov_32( S2, 0, S0, 2 ); \
D3 = mm128_shuflmov_32( S3, 0, S0, 3 ); \
D0 = mm128_shuflmov_32( D0, 2, S2, 0 ); \
D1 = mm128_shuflmov_32( D1, 2, S2, 1 ); \
D2 = mm128_shuflmov_32( D2, 1, S1, 2 ); \
D3 = mm128_shuflmov_32( D3, 1, S1, 3 ); \
D0 = mm128_shuflmov_32( D0, 3, S3, 0 ); \
D1 = mm128_shuflmov_32( D1, 3, S3, 1 ); \
D2 = mm128_shuflmov_32( D2, 3, S3, 2 ); \
D3 = mm128_shuflmov_32( D3, 2, S2, 3 );
D0 = mm128_mov32_32( S0, 1, S1, 0 ); \
D1 = mm128_mov32_32( S1, 0, S0, 1 ); \
D2 = mm128_mov32_32( S2, 0, S0, 2 ); \
D3 = mm128_mov32_32( S3, 0, S0, 3 ); \
D0 = mm128_mov32_32( D0, 2, S2, 0 ); \
D1 = mm128_mov32_32( D1, 2, S2, 1 ); \
D2 = mm128_mov32_32( D2, 1, S1, 2 ); \
D3 = mm128_mov32_32( D3, 1, S1, 3 ); \
D0 = mm128_mov32_32( D0, 3, S3, 0 ); \
D1 = mm128_mov32_32( D1, 3, S3, 1 ); \
D2 = mm128_mov32_32( D2, 3, S3, 2 ); \
D3 = mm128_mov32_32( D3, 2, S2, 3 );

#define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \
S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \
@@ -406,28 +406,6 @@ static inline void extr_lane_4x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ];
}

// Still used by decred due to odd data size: 180 bytes
// bit_len must be multiple of 32
static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
void *src2, void *src3, const int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;

for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}

#if defined(__SSSE3__)

static inline void mm128_bswap32_80( void *d, void *s )
@@ -468,32 +446,6 @@ static inline void mm128_bswap32_80( void *d, void *s )

#endif

static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m128i( d, 0 ) = _mm_set1_epi32( bswap_32( s[ 0] ) );
casti_m128i( d, 1 ) = _mm_set1_epi32( bswap_32( s[ 1] ) );
casti_m128i( d, 2 ) = _mm_set1_epi32( bswap_32( s[ 2] ) );
casti_m128i( d, 3 ) = _mm_set1_epi32( bswap_32( s[ 3] ) );
casti_m128i( d, 4 ) = _mm_set1_epi32( bswap_32( s[ 4] ) );
casti_m128i( d, 5 ) = _mm_set1_epi32( bswap_32( s[ 5] ) );
casti_m128i( d, 6 ) = _mm_set1_epi32( bswap_32( s[ 6] ) );
casti_m128i( d, 7 ) = _mm_set1_epi32( bswap_32( s[ 7] ) );
casti_m128i( d, 8 ) = _mm_set1_epi32( bswap_32( s[ 8] ) );
casti_m128i( d, 9 ) = _mm_set1_epi32( bswap_32( s[ 9] ) );
casti_m128i( d,10 ) = _mm_set1_epi32( bswap_32( s[10] ) );
casti_m128i( d,11 ) = _mm_set1_epi32( bswap_32( s[11] ) );
casti_m128i( d,12 ) = _mm_set1_epi32( bswap_32( s[12] ) );
casti_m128i( d,13 ) = _mm_set1_epi32( bswap_32( s[13] ) );
casti_m128i( d,14 ) = _mm_set1_epi32( bswap_32( s[14] ) );
casti_m128i( d,15 ) = _mm_set1_epi32( bswap_32( s[15] ) );
casti_m128i( d,16 ) = _mm_set1_epi32( bswap_32( s[16] ) );
casti_m128i( d,17 ) = _mm_set1_epi32( bswap_32( s[17] ) );
casti_m128i( d,18 ) = _mm_set1_epi32( bswap_32( s[18] ) );
casti_m128i( d,19 ) = _mm_set1_epi32( bswap_32( s[19] ) );
}
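Editor's illustrative note (not part of the commit): each 32-bit word of the 80-byte block header is byte swapped and replicated into all four interleaved lanes. A portable reference sketch of the same 320-byte layout, assuming only that bswap_32 is the usual 32-bit byte swap:

#include <stdint.h>

/* Plain-C reference for the 4-way bswap + broadcast above (illustration only).
   dst must hold 20 * 4 uint32_t (320 bytes); src is the 80-byte header. */
static inline uint32_t bswap32_ref( uint32_t x )
{
   return (x >> 24) | ((x >> 8) & 0x0000ff00) |
          ((x << 8) & 0x00ff0000) | (x << 24);
}

static inline void bswap32_intrlv80_4x32_ref( void *dst, const void *src )
{
   uint32_t *d = (uint32_t*)dst;
   const uint32_t *s = (const uint32_t*)src;
   for ( int i = 0; i < 20; i++ )
   {
      const uint32_t w = bswap32_ref( s[i] );            // one header word
      d[4*i+0] = d[4*i+1] = d[4*i+2] = d[4*i+3] = w;     // copy to all 4 lanes
   }
}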
/*
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
@@ -547,77 +499,10 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
*/

// 8x32

#define ILEAVE_8x32( i ) do \
{ \
uint32_t *d = (uint32_t*)(dst) + ( (i) << 3 ); \
d[0] = *( (const uint32_t*)(s0) +(i) ); \
d[1] = *( (const uint32_t*)(s1) +(i) ); \
d[2] = *( (const uint32_t*)(s2) +(i) ); \
d[3] = *( (const uint32_t*)(s3) +(i) ); \
d[4] = *( (const uint32_t*)(s4) +(i) ); \
d[5] = *( (const uint32_t*)(s5) +(i) ); \
d[6] = *( (const uint32_t*)(s6) +(i) ); \
d[7] = *( (const uint32_t*)(s7) +(i) ); \
} while(0)

static inline void intrlv_8x32b( void *dst, const void *s0, const void *s1,
const void *s2, const void *s3, const void *s4, const void *s5,
const void *s6, const void *s7, const int bit_len )
{
for ( int i = 0; i < bit_len/32; i++ )
ILEAVE_8x32( i );
}
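Editor's illustrative note (not part of the commit): ILEAVE_8x32 realizes the layout rule interleaved[8*i + lane] = lane[i], which is the word order 8-way hashing kernels expect. A scalar sketch of the same rule, with a hypothetical helper name:

#include <stdint.h>

/* Hypothetical reference: word i of lane l lands at interleaved index 8*i + l. */
static void intrlv_8x32_ref( uint32_t *dst, const uint32_t *lanes[8], int words )
{
   for ( int i = 0; i < words; i++ )
      for ( int l = 0; l < 8; l++ )
         dst[ 8*i + l ] = lanes[l][i];
}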
/*
// default
static inline void intrlv_8x32( void *dst, const void *s0, const void *s1,
const void *s2, const void *s3, const void *s4, const void *s5,
const void *s6, const void *s7, const int bit_len )
{
ILEAVE_8x32( 0 ); ILEAVE_8x32( 1 );
ILEAVE_8x32( 2 ); ILEAVE_8x32( 3 );
ILEAVE_8x32( 4 ); ILEAVE_8x32( 5 );
ILEAVE_8x32( 6 ); ILEAVE_8x32( 7 );
if ( bit_len <= 256 ) return;
ILEAVE_8x32( 8 ); ILEAVE_8x32( 9 );
ILEAVE_8x32( 10 ); ILEAVE_8x32( 11 );
ILEAVE_8x32( 12 ); ILEAVE_8x32( 13 );
ILEAVE_8x32( 14 ); ILEAVE_8x32( 15 );
if ( bit_len <= 512 ) return;
ILEAVE_8x32( 16 ); ILEAVE_8x32( 17 );
ILEAVE_8x32( 18 ); ILEAVE_8x32( 19 );
if ( bit_len <= 640 ) return;
ILEAVE_8x32( 20 ); ILEAVE_8x32( 21 );
ILEAVE_8x32( 22 ); ILEAVE_8x32( 23 );
ILEAVE_8x32( 24 ); ILEAVE_8x32( 25 );
ILEAVE_8x32( 26 ); ILEAVE_8x32( 27 );
ILEAVE_8x32( 28 ); ILEAVE_8x32( 29 );
ILEAVE_8x32( 30 ); ILEAVE_8x32( 31 );
}
*/

/* default
static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1,
const void *s2, const void *s3, const void *s4, const void *s5,
const void *s6, const void *s7 )
{
ILEAVE_8x32( 0 ); ILEAVE_8x32( 1 );
ILEAVE_8x32( 2 ); ILEAVE_8x32( 3 );
ILEAVE_8x32( 4 ); ILEAVE_8x32( 5 );
ILEAVE_8x32( 6 ); ILEAVE_8x32( 7 );
ILEAVE_8x32( 8 ); ILEAVE_8x32( 9 );
ILEAVE_8x32( 10 ); ILEAVE_8x32( 11 );
ILEAVE_8x32( 12 ); ILEAVE_8x32( 13 );
ILEAVE_8x32( 14 ); ILEAVE_8x32( 15 );
}
*/

#undef ILEAVE_8x32

#if defined(__SSE4_1__)

static inline void intrlv_8x32( void *dst, const void *s0, const void *s1,
@@ -820,71 +705,6 @@ static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2,

#endif // SSE4_1

#define DLEAVE_8x32( i ) do \
{ \
const uint32_t *s = (const uint32_t*)(src) + ( (i) << 3 ); \
*( (uint32_t*)(d0) +(i) ) = s[0]; \
*( (uint32_t*)(d1) +(i) ) = s[1]; \
*( (uint32_t*)(d2) +(i) ) = s[2]; \
*( (uint32_t*)(d3) +(i) ) = s[3]; \
*( (uint32_t*)(d4) +(i) ) = s[4]; \
*( (uint32_t*)(d5) +(i) ) = s[5]; \
*( (uint32_t*)(d6) +(i) ) = s[6]; \
*( (uint32_t*)(d7) +(i) ) = s[7]; \
} while(0)

static inline void dintrlv_8x32b( void *d0, void *d1, void *d2, void *d3,
void *d4, void *d5, void *d6, void *d7, const void *src,
const int bit_len )
{
for ( int i = 0; i < bit_len/32; i++ )
DLEAVE_8x32( i );
}
/* default
static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3,
void *d4, void *d5, void *d6, void *d7, const void *src,
const int bit_len )
{
DLEAVE_8x32( 0 ); DLEAVE_8x32( 1 );
DLEAVE_8x32( 2 ); DLEAVE_8x32( 3 );
DLEAVE_8x32( 4 ); DLEAVE_8x32( 5 );
DLEAVE_8x32( 6 ); DLEAVE_8x32( 7 );
if ( bit_len <= 256 ) return;
DLEAVE_8x32( 8 ); DLEAVE_8x32( 9 );
DLEAVE_8x32( 10 ); DLEAVE_8x32( 11 );
DLEAVE_8x32( 12 ); DLEAVE_8x32( 13 );
DLEAVE_8x32( 14 ); DLEAVE_8x32( 15 );
if ( bit_len <= 512 ) return;
DLEAVE_8x32( 16 ); DLEAVE_8x32( 17 );
DLEAVE_8x32( 18 ); DLEAVE_8x32( 19 );
if ( bit_len <= 640 ) return;
DLEAVE_8x32( 20 ); DLEAVE_8x32( 21 );
DLEAVE_8x32( 22 ); DLEAVE_8x32( 23 );
DLEAVE_8x32( 24 ); DLEAVE_8x32( 25 );
DLEAVE_8x32( 26 ); DLEAVE_8x32( 27 );
DLEAVE_8x32( 28 ); DLEAVE_8x32( 29 );
DLEAVE_8x32( 30 ); DLEAVE_8x32( 31 );
}
*/

/* default
static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3,
void *d4, void *d5, void *d6, void *d7, const void *src )
{
DLEAVE_8x32( 0 ); DLEAVE_8x32( 1 );
DLEAVE_8x32( 2 ); DLEAVE_8x32( 3 );
DLEAVE_8x32( 4 ); DLEAVE_8x32( 5 );
DLEAVE_8x32( 6 ); DLEAVE_8x32( 7 );
DLEAVE_8x32( 8 ); DLEAVE_8x32( 9 );
DLEAVE_8x32( 10 ); DLEAVE_8x32( 11 );
DLEAVE_8x32( 12 ); DLEAVE_8x32( 13 );
DLEAVE_8x32( 14 ); DLEAVE_8x32( 15 );
}
*/
#undef DLEAVE_8x32

static inline void extr_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
@@ -909,31 +729,6 @@ static inline void extr_lane_8x32( void *d, const void *s,

#if defined(__AVX2__)

static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m256i( d, 0 ) = _mm256_set1_epi32( bswap_32( s[ 0] ) );
casti_m256i( d, 1 ) = _mm256_set1_epi32( bswap_32( s[ 1] ) );
casti_m256i( d, 2 ) = _mm256_set1_epi32( bswap_32( s[ 2] ) );
casti_m256i( d, 3 ) = _mm256_set1_epi32( bswap_32( s[ 3] ) );
casti_m256i( d, 4 ) = _mm256_set1_epi32( bswap_32( s[ 4] ) );
casti_m256i( d, 5 ) = _mm256_set1_epi32( bswap_32( s[ 5] ) );
casti_m256i( d, 6 ) = _mm256_set1_epi32( bswap_32( s[ 6] ) );
casti_m256i( d, 7 ) = _mm256_set1_epi32( bswap_32( s[ 7] ) );
casti_m256i( d, 8 ) = _mm256_set1_epi32( bswap_32( s[ 8] ) );
casti_m256i( d, 9 ) = _mm256_set1_epi32( bswap_32( s[ 9] ) );
casti_m256i( d,10 ) = _mm256_set1_epi32( bswap_32( s[10] ) );
casti_m256i( d,11 ) = _mm256_set1_epi32( bswap_32( s[11] ) );
casti_m256i( d,12 ) = _mm256_set1_epi32( bswap_32( s[12] ) );
casti_m256i( d,13 ) = _mm256_set1_epi32( bswap_32( s[13] ) );
casti_m256i( d,14 ) = _mm256_set1_epi32( bswap_32( s[14] ) );
casti_m256i( d,15 ) = _mm256_set1_epi32( bswap_32( s[15] ) );
casti_m256i( d,16 ) = _mm256_set1_epi32( bswap_32( s[16] ) );
casti_m256i( d,17 ) = _mm256_set1_epi32( bswap_32( s[17] ) );
casti_m256i( d,18 ) = _mm256_set1_epi32( bswap_32( s[18] ) );
casti_m256i( d,19 ) = _mm256_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -994,7 +789,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
casti_m128i( d,38 ) =
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
}
*/

#endif // AVX2

@@ -1355,28 +1150,6 @@ static inline void dintrlv_16x32_512( void *dst00, void *dst01, void *dst02,

#endif // SSE4_1

#define DLEAVE_16x32( i ) do \
{ \
const uint32_t *s = (const uint32_t*)(src) + ( (i) << 4 ); \
*( (uint32_t*)(d00) +(i) ) = s[ 0]; \
*( (uint32_t*)(d01) +(i) ) = s[ 1]; \
*( (uint32_t*)(d02) +(i) ) = s[ 2]; \
*( (uint32_t*)(d03) +(i) ) = s[ 3]; \
*( (uint32_t*)(d04) +(i) ) = s[ 4]; \
*( (uint32_t*)(d05) +(i) ) = s[ 5]; \
*( (uint32_t*)(d06) +(i) ) = s[ 6]; \
*( (uint32_t*)(d07) +(i) ) = s[ 7]; \
*( (uint32_t*)(d08) +(i) ) = s[ 8]; \
*( (uint32_t*)(d09) +(i) ) = s[ 9]; \
*( (uint32_t*)(d10) +(i) ) = s[10]; \
*( (uint32_t*)(d11) +(i) ) = s[11]; \
*( (uint32_t*)(d12) +(i) ) = s[12]; \
*( (uint32_t*)(d13) +(i) ) = s[13]; \
*( (uint32_t*)(d14) +(i) ) = s[14]; \
*( (uint32_t*)(d15) +(i) ) = s[15]; \
} while(0)

static inline void extr_lane_16x32( void *d, const void *s,
const int lane, const int bit_len )
{
@@ -1401,31 +1174,6 @@ static inline void extr_lane_16x32( void *d, const void *s,

#if defined(__AVX512F__) && defined(__AVX512VL__)

static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m512i( d, 0 ) = _mm512_set1_epi32( bswap_32( s[ 0] ) );
casti_m512i( d, 1 ) = _mm512_set1_epi32( bswap_32( s[ 1] ) );
casti_m512i( d, 2 ) = _mm512_set1_epi32( bswap_32( s[ 2] ) );
casti_m512i( d, 3 ) = _mm512_set1_epi32( bswap_32( s[ 3] ) );
casti_m512i( d, 4 ) = _mm512_set1_epi32( bswap_32( s[ 4] ) );
casti_m512i( d, 5 ) = _mm512_set1_epi32( bswap_32( s[ 5] ) );
casti_m512i( d, 6 ) = _mm512_set1_epi32( bswap_32( s[ 6] ) );
casti_m512i( d, 7 ) = _mm512_set1_epi32( bswap_32( s[ 7] ) );
casti_m512i( d, 8 ) = _mm512_set1_epi32( bswap_32( s[ 8] ) );
casti_m512i( d, 9 ) = _mm512_set1_epi32( bswap_32( s[ 9] ) );
casti_m512i( d,10 ) = _mm512_set1_epi32( bswap_32( s[10] ) );
casti_m512i( d,11 ) = _mm512_set1_epi32( bswap_32( s[11] ) );
casti_m512i( d,12 ) = _mm512_set1_epi32( bswap_32( s[12] ) );
casti_m512i( d,13 ) = _mm512_set1_epi32( bswap_32( s[13] ) );
casti_m512i( d,14 ) = _mm512_set1_epi32( bswap_32( s[14] ) );
casti_m512i( d,15 ) = _mm512_set1_epi32( bswap_32( s[15] ) );
casti_m512i( d,16 ) = _mm512_set1_epi32( bswap_32( s[16] ) );
casti_m512i( d,17 ) = _mm512_set1_epi32( bswap_32( s[17] ) );
casti_m512i( d,18 ) = _mm512_set1_epi32( bswap_32( s[18] ) );
casti_m512i( d,19 ) = _mm512_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1441,92 +1189,32 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );

casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) =
casti_m128i( d, 2 ) =
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0 , 0x00 );
casti_m128i( d, 4 ) =
casti_m128i( d, 5 ) =
casti_m128i( d, 6 ) =
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0 , 0x55 );
casti_m128i( d, 8 ) =
casti_m128i( d, 9 ) =
casti_m128i( d,10 ) =
casti_m128i( d,11 ) = _mm_shuffle_epi32( s0 , 0xaa );
casti_m128i( d,12 ) =
casti_m128i( d,13 ) =
casti_m128i( d,14 ) =
casti_m128i( d,15 ) = _mm_shuffle_epi32( s0 , 0xff );
casti_m512i( d, 0 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0x00 ) );
casti_m512i( d, 1 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0x55 ) );
casti_m512i( d, 2 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0xaa ) );
casti_m512i( d, 3 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0 , 0xff ) );

casti_m128i( d,16 ) =
casti_m128i( d,17 ) =
casti_m128i( d,18 ) =
casti_m128i( d,19 ) = _mm_shuffle_epi32( s1 , 0x00 );
casti_m128i( d,20 ) =
casti_m128i( d,21 ) =
casti_m128i( d,22 ) =
casti_m128i( d,23 ) = _mm_shuffle_epi32( s1 , 0x55 );
casti_m128i( d,24 ) =
casti_m128i( d,25 ) =
casti_m128i( d,26 ) =
casti_m128i( d,27 ) = _mm_shuffle_epi32( s1 , 0xaa );
casti_m128i( d,28 ) =
casti_m128i( d,29 ) =
casti_m128i( d,30 ) =
casti_m128i( d,31 ) = _mm_shuffle_epi32( s1 , 0xff );
casti_m512i( d, 4 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0x00 ) );
casti_m512i( d, 5 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0x55 ) );
casti_m512i( d, 6 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0xaa ) );
casti_m512i( d, 7 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1 , 0xff ) );

casti_m128i( d,32 ) =
casti_m128i( d,33 ) =
casti_m128i( d,34 ) =
casti_m128i( d,35 ) = _mm_shuffle_epi32( s2 , 0x00 );
casti_m128i( d,36 ) =
casti_m128i( d,37 ) =
casti_m128i( d,38 ) =
casti_m128i( d,39 ) = _mm_shuffle_epi32( s2 , 0x55 );
casti_m128i( d,40 ) =
casti_m128i( d,41 ) =
casti_m128i( d,42 ) =
casti_m128i( d,43 ) = _mm_shuffle_epi32( s2 , 0xaa );
casti_m128i( d,44 ) =
casti_m128i( d,45 ) =
casti_m128i( d,46 ) =
casti_m128i( d,47 ) = _mm_shuffle_epi32( s2 , 0xff );
casti_m512i( d, 8 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0x00 ) );
casti_m512i( d, 9 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0x55 ) );
casti_m512i( d,10 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0xaa ) );
casti_m512i( d,11 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2 , 0xff ) );

casti_m128i( d,48 ) =
casti_m128i( d,49 ) =
casti_m128i( d,50 ) =
casti_m128i( d,51 ) = _mm_shuffle_epi32( s3 , 0x00 );
casti_m128i( d,52 ) =
casti_m128i( d,53 ) =
casti_m128i( d,54 ) =
casti_m128i( d,55 ) = _mm_shuffle_epi32( s3 , 0x55 );
casti_m128i( d,56 ) =
casti_m128i( d,57 ) =
casti_m128i( d,58 ) =
casti_m128i( d,59 ) = _mm_shuffle_epi32( s3 , 0xaa );
casti_m128i( d,60 ) =
casti_m128i( d,61 ) =
casti_m128i( d,62 ) =
casti_m128i( d,63 ) = _mm_shuffle_epi32( s3 , 0xff );
casti_m512i( d,12 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0x00 ) );
casti_m512i( d,13 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0x55 ) );
casti_m512i( d,14 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0xaa ) );
casti_m512i( d,15 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3 , 0xff ) );

casti_m128i( d,64 ) =
casti_m128i( d,65 ) =
casti_m128i( d,66 ) =
casti_m128i( d,67 ) = _mm_shuffle_epi32( s4 , 0x00 );
casti_m128i( d,68 ) =
casti_m128i( d,69 ) =
casti_m128i( d,70 ) =
casti_m128i( d,71 ) = _mm_shuffle_epi32( s4 , 0x55 );
casti_m128i( d,72 ) =
casti_m128i( d,73 ) =
casti_m128i( d,74 ) =
casti_m128i( d,75 ) = _mm_shuffle_epi32( s4 , 0xaa );
casti_m128i( d,76 ) =
casti_m128i( d,77 ) =
casti_m128i( d,78 ) =
casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff );
casti_m512i( d,16 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0x00 ) );
casti_m512i( d,17 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0x55 ) );
casti_m512i( d,18 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0xaa ) );
casti_m512i( d,19 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4 , 0xff ) );
}
*/

#endif // AVX512
@@ -1534,47 +1222,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data

// 2x64 (SSE2)

static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64_t *d = (uint64_t*)dst;
const uint64_t *s0 = (const uint64_t*)src0;
const uint64_t *s1 = (const uint64_t*)src1;
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s0[ 1]; d[ 3] = s1[ 1];
d[ 4] = s0[ 2]; d[ 5] = s1[ 2]; d[ 6] = s0[ 3]; d[ 7] = s1[ 3];
if ( bit_len <= 256 ) return;
d[ 8] = s0[ 4]; d[ 9] = s1[ 4]; d[10] = s0[ 5]; d[11] = s1[ 5];
d[12] = s0[ 6]; d[13] = s1[ 6]; d[14] = s0[ 7]; d[15] = s1[ 7];
if ( bit_len <= 512 ) return;
d[16] = s0[ 8]; d[17] = s1[ 8]; d[18] = s0[ 9]; d[19] = s1[ 9];
if ( bit_len <= 640 ) return;
d[20] = s0[10]; d[21] = s1[10]; d[22] = s0[11]; d[23] = s1[11];
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}

static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
const uint64_t *s = (const uint64_t*)src;

d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d0[ 1] = s[ 2]; d1[ 1] = s[ 3];
d0[ 2] = s[ 4]; d1[ 2] = s[ 5]; d0[ 3] = s[ 6]; d1[ 3] = s[ 7];
if ( bit_len <= 256 ) return;
d0[ 4] = s[ 8]; d1[ 4] = s[ 9]; d0[ 5] = s[10]; d1[ 5] = s[11];
d0[ 6] = s[12]; d1[ 6] = s[13]; d0[ 7] = s[14]; d1[ 7] = s[15];
if ( bit_len <= 512 ) return;
d0[ 8] = s[16]; d1[ 8] = s[17]; d0[ 9] = s[18]; d1[ 9] = s[19];
if ( bit_len <= 640 ) return;
d0[10] = s[20]; d1[10] = s[21]; d0[11] = s[22]; d1[11] = s[23];
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
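Editor's illustrative note (not part of the commit): a minimal round-trip sketch, assuming 512-bit (64-byte) lanes. Interleaving with intrlv_2x64 and then de-interleaving with dintrlv_2x64 should reproduce both inputs, and the interleaved buffer alternates one 64-bit word from each lane.

#include <stdint.h>

/* Hypothetical self-check: intrlv_2x64 then dintrlv_2x64 is the identity on
   each lane (shown for bit_len = 512). */
static int test_intrlv_2x64_roundtrip( void )
{
   uint64_t a[8], b[8], ab[16], a2[8], b2[8];
   for ( int i = 0; i < 8; i++ ) { a[i] = i; b[i] = 100 + i; }
   intrlv_2x64( ab, a, b, 512 );      // ab = { a[0], b[0], a[1], b[1], ... }
   dintrlv_2x64( a2, b2, ab, 512 );
   for ( int i = 0; i < 8; i++ )
      if ( a2[i] != a[i] || b2[i] != b[i] ) return 0;
   return 1;
}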
// 4x64 (AVX2)

static inline void intrlv_4x64( void *dst, const void *src0,
@@ -2149,19 +1796,19 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
__m128i *d = (__m128i*)dst;
__m512i *d = (__m512i*)dst;
const __m128i *s = (const __m128i*)src;

d[ 0] = d[ 1] = d[ 2] = d[ 3] = _mm_shuffle_epi32( s[0], 0x44 );
d[ 4] = d[ 5] = d[ 6] = d[ 7] = _mm_shuffle_epi32( s[0], 0xee );
d[ 8] = d[ 9] = d[10] = d[11] = _mm_shuffle_epi32( s[1], 0x44 );
d[12] = d[13] = d[14] = d[15] = _mm_shuffle_epi32( s[1], 0xee );
d[16] = d[17] = d[18] = d[19] = _mm_shuffle_epi32( s[2], 0x44 );
d[20] = d[21] = d[22] = d[23] = _mm_shuffle_epi32( s[2], 0xee );
d[24] = d[25] = d[26] = d[27] = _mm_shuffle_epi32( s[3], 0x44 );
d[28] = d[29] = d[30] = d[31] = _mm_shuffle_epi32( s[3], 0xee );
d[32] = d[33] = d[34] = d[35] = _mm_shuffle_epi32( s[4], 0x44 );
d[36] = d[37] = d[38] = d[39] = _mm_shuffle_epi32( s[4], 0xee );
d[ 0] = mm512_bcast_m128( _mm_shuffle_epi32( s[0], 0x44 ) );
d[ 1] = mm512_bcast_m128( _mm_shuffle_epi32( s[0], 0xee ) );
d[ 2] = mm512_bcast_m128( _mm_shuffle_epi32( s[1], 0x44 ) );
d[ 3] = mm512_bcast_m128( _mm_shuffle_epi32( s[1], 0xee ) );
d[ 4] = mm512_bcast_m128( _mm_shuffle_epi32( s[2], 0x44 ) );
d[ 5] = mm512_bcast_m128( _mm_shuffle_epi32( s[2], 0xee ) );
d[ 6] = mm512_bcast_m128( _mm_shuffle_epi32( s[3], 0x44 ) );
d[ 7] = mm512_bcast_m128( _mm_shuffle_epi32( s[3], 0xee ) );
d[ 8] = mm512_bcast_m128( _mm_shuffle_epi32( s[4], 0x44 ) );
d[ 9] = mm512_bcast_m128( _mm_shuffle_epi32( s[4], 0xee ) );
}
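Editor's illustrative note (not part of the commit): the rewrite above replaces forty 128-bit stores with ten 512-bit broadcast stores. A sketch of the underlying equivalence, assuming AVX-512F and the mm512_bcast_m128 macro from this header set: one broadcast store writes the same bytes as four consecutive 128-bit stores of the same value.

#include <immintrin.h>

/* Sketch only: dst receives 64 bytes, i.e. four copies of x. */
static inline void bcast_equiv_demo( void *dst, __m128i x )
{
   _mm512_storeu_si512( dst, mm512_bcast_m128( x ) );   // == storing x 4 times
}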
// byte swap and broadcast to all lanes
@@ -2180,50 +1827,16 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );

casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) =
casti_m128i( d, 2 ) =
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_m128i( d, 4 ) =
casti_m128i( d, 5 ) =
casti_m128i( d, 6 ) =
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0, 0xee );

casti_m128i( d, 8 ) =
casti_m128i( d, 9 ) =
casti_m128i( d, 10 ) =
casti_m128i( d, 11 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_m128i( d, 12 ) =
casti_m128i( d, 13 ) =
casti_m128i( d, 14 ) =
casti_m128i( d, 15 ) = _mm_shuffle_epi32( s1, 0xee );

casti_m128i( d, 16 ) =
casti_m128i( d, 17 ) =
casti_m128i( d, 18 ) =
casti_m128i( d, 19 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_m128i( d, 20 ) =
casti_m128i( d, 21 ) =
casti_m128i( d, 22 ) =
casti_m128i( d, 23 ) = _mm_shuffle_epi32( s2, 0xee );

casti_m128i( d, 24 ) =
casti_m128i( d, 25 ) =
casti_m128i( d, 26 ) =
casti_m128i( d, 27 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_m128i( d, 28 ) =
casti_m128i( d, 29 ) =
casti_m128i( d, 30 ) =
casti_m128i( d, 31 ) = _mm_shuffle_epi32( s3, 0xee );

casti_m128i( d, 32 ) =
casti_m128i( d, 33 ) =
casti_m128i( d, 34 ) =
casti_m128i( d, 35 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_m128i( d, 36 ) =
casti_m128i( d, 37 ) =
casti_m128i( d, 38 ) =
casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee );
casti_m512i( d, 0 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0, 0x44 ) );
casti_m512i( d, 1 ) = mm512_bcast_m128( _mm_shuffle_epi32( s0, 0xee ) );
casti_m512i( d, 2 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1, 0x44 ) );
casti_m512i( d, 3 ) = mm512_bcast_m128( _mm_shuffle_epi32( s1, 0xee ) );
casti_m512i( d, 4 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2, 0x44 ) );
casti_m512i( d, 5 ) = mm512_bcast_m128( _mm_shuffle_epi32( s2, 0xee ) );
casti_m512i( d, 6 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3, 0x44 ) );
casti_m512i( d, 7 ) = mm512_bcast_m128( _mm_shuffle_epi32( s3, 0xee ) );
casti_m512i( d, 8 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4, 0x44 ) );
casti_m512i( d, 9 ) = mm512_bcast_m128( _mm_shuffle_epi32( s4, 0xee ) );
}

#endif // AVX512

@@ -2395,11 +2008,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );

casti_m512i( d, 0 ) = _mm512_broadcast_i64x2( s0 );
casti_m512i( d, 1 ) = _mm512_broadcast_i64x2( s1 );
casti_m512i( d, 2 ) = _mm512_broadcast_i64x2( s2 );
casti_m512i( d, 3 ) = _mm512_broadcast_i64x2( s3 );
casti_m512i( d, 4 ) = _mm512_broadcast_i64x2( s4 );
casti_m512i( d, 0 ) = mm512_bcast_m128( s0 );
casti_m512i( d, 1 ) = mm512_bcast_m128( s1 );
casti_m512i( d, 2 ) = mm512_bcast_m128( s2 );
casti_m512i( d, 3 ) = mm512_bcast_m128( s3 );
casti_m512i( d, 4 ) = mm512_bcast_m128( s4 );
}

#endif
@@ -93,10 +93,15 @@ static inline uint32_t u32_mov128_32( const __m128i a )
return n;
}

// Equivalent of set1, broadcast integer to all elements.
#define m128_const_i128( i ) mm128_mov64_128( i )
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
// Emulate broadcast & insert instructions not available in SSE2
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )

#define m128_const_i128( i ) mm128_mov64_128( i )

// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32

#if defined(__SSE4_1__)

@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )

#else // No insert in SSE2
#else

#define m128_const_64 _mm_set_epi64x

@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )

#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
#define m128_one_64 mm128_bcast_i64( 1 )
#define m128_one_32 mm128_bcast_i32( 1 )
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
#define m128_one_8 mm128_bcast_i32( 0x01010101 )

// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
// sizing. It's unique.
//
// It can:
// - zero 32 bit elements of a 128 bit vector.
// - zero any number of 32 bit elements of a 128 bit vector.
// - extract any 32 bit element from one 128 bit vector and insert the
// data to any 32 bit element of another 128 bit vector, or the same vector.
// - do both simultaneously.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
// c[5:4] destination element selector
// c[7:6] source element selector

// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v2, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )

// Some examples of simple operations:
/* Another way to do it with individual arguments.
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), \
(mask) | ((i1)<<4) | ((i2)<<6) ) )
*/

// Insert 32 bit integer into v at element c and return modified v.
// Examples of simple operations using xim:

// Insert 32 bit integer into v at element c and return updated v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }

// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Zero 32 bit elements when bit in mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }

// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
// Move element i2 of v2 to element i1 of v1 and return updated v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )

#endif // SSE4_1
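Editor's illustrative note (not part of the commit): an expansion of the xim helper, assuming SSE4.1. insertps takes the source element selector in bits 7:6 of the immediate and the destination selector in bits 5:4, so mm128_mov32_32( v1, 1, v2, 2 ) could be spelled out as:

#include <immintrin.h>

/* Equivalent of mm128_mov32_32( v1, 1, v2, 2 ): copy element 2 of v2 into
   element 1 of v1, leaving the other elements of v1 unchanged (SSE4.1). */
static inline __m128i mov32_32_demo( __m128i v1, __m128i v2 )
{
   // immediate: source element 2 in bits 7:6, destination element 1 in bits 5:4
   return _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ),
                                           _mm_castsi128_ps( v2 ),
                                           (2<<6) | (1<<4) ) );
}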
@@ -280,7 +289,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.

#define mm_movmask_64( v ) \
@@ -385,6 +394,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
@@ -396,12 +406,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );

//
// Rotate vector elements across all lanes

#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64

#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
@@ -414,13 +423,11 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

#endif

// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
// Rotate 64 bit lanes

#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
@@ -438,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif

// Rotate 32 bit lanes

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -445,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
@@ -563,9 +572,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );

// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.

#if defined(__SSSE3__)
@@ -68,31 +68,33 @@ typedef union
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )

// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }

#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )

#define mm256_bcast_m128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )

// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
union { __m256i m256i;
uint64_t u64[4]; } v;
union { __m256i m256i; uint64_t u64[4]; } v;
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
return v.m256i;
}

// Equivalent of set1.
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Deprecated
#define m256_const1_128 mm256_bcast_m128
#define m256_const1_i128 mm256_bcast_i128
#define m256_const1_64 mm256_bcast_i64
#define m256_const1_32 mm256_bcast_i32

#define m256_const2_64( i1, i0 ) \
m256_const1_128( m128_const_64( i1, i0 ) )
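Editor's illustrative note (not part of the commit): the new bcast macros are intended to produce the same vectors as the corresponding set1 intrinsics; the practical difference is only whether the argument starts in a vector register or in a general purpose register. A sketch of the equivalence, assuming AVX2 and these header macros:

#include <stdint.h>
#include <immintrin.h>

/* Sketch only: both outputs hold four copies of x. */
static inline void bcast_vs_set1_demo( uint64_t x, __m256i *out_a, __m256i *out_b )
{
   *out_a = mm256_bcast_i64( x );        // broadcast macro from this header
   *out_b = _mm256_set1_epi64x( x );     // intrinsic equivalent
}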
@@ -101,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.

#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 m256_const1_i128( 1 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_i128( 1 )
#define m256_one_64 mm256_bcast_i64( 1 )
#define m256_one_32 mm256_bcast_i32( 1 )
#define m256_one_16 mm256_bcast_i16( 1 )
#define m256_one_8 mm256_bcast_i8 ( 1 )

static inline __m256i mm256_neg1_fn()
{
@@ -118,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
#define m256_neg1 mm256_neg1_fn()

// Consistent naming for similar operations.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )

//
// Memory functions
@@ -241,7 +243,7 @@ static inline __m256i mm256_not( const __m256i v )

// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.

#define mm256_movmask_64( v ) \
@@ -355,18 +357,22 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )

//
// Cross lane shuffles
//
// Rotate elements across all lanes.

// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128

// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )

/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)

@@ -389,6 +395,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
0x0000000200000001, 0x0000000000000007 ) )

#endif
*/

//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -412,13 +419,11 @@ static inline __m256i mm256_shufll_32( const __m256i v )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }

// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
// 64 bit lanes

#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32

#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
@@ -436,6 +441,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif

// 32 bit lanes

#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
@@ -443,8 +450,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16

#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )

#define m512_const_128( v3, v2, v1, v0 ) \
// Work in progress.
// modified naming scheme to align more with opcode mnemonic:
// m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short for
// broadcast, i indicates integer arg, m is vector. Set1 intrinsics should
// generally be used for integer data.
// mm512_const should only be used with immediate integer arguments, use
// _mm512_set intrinsic instead.
// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
// for either the arg size or arg count.

#define mm512_set_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )

@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}

// Broadcast with vector argument is generally more efficient except for
// integer immediate constants or when data was most recently referenced as
// integer and is still available in an integer register.

/* not used
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
*/

#define m512_const1_128( v ) \
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Integer input argument up to 64 bits
#define m512_const1_i128( i ) \
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Low 64 bits only, high 64 bits are zeroed.
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )

//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
// const1 is deprecated, use bcast instead
#define m512_const1_128 mm512_bcast_m128
#define m512_const1_i128 mm512_bcast_i128
#define m512_const1_64 mm512_bcast_i64
#define m512_const1_32 mm512_bcast_i32

#define m512_const2_128( v1, v0 ) \
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )

#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )

mm512_bcast_m128( m128_const_64( i1, i0 ) )

static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
@@ -179,11 +195,11 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
#define m512_one_128 m512_const1_i128( 1 )
#define m512_one_64 m512_const1_64( 1 )
#define m512_one_32 m512_const1_32( 1 )
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )

// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
@@ -193,8 +209,6 @@ static inline __m512i mm512_neg1_fn()
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks

//
// Basic operations without SIMD equivalent
@@ -343,10 +357,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -360,10 +374,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -449,7 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )

/*
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +510,18 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.

#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64

// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )

// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }

// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -507,26 +533,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );

// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64

// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )

// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }

// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
// 64 bit lanes

#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32

#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -537,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )

#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
// 32 bit lanes

#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16

#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )

#endif // AVX512

#endif // SIMD_512_H__
@@ -55,6 +55,13 @@
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;

typedef union
{
uint128_t u128;
uint64_t u64[2];
uint32_t u32[4];
} __attribute__ ((aligned (16))) u128_ovly;

// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.
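Editor's illustrative note (not part of the commit): a usage sketch for the new overlay union, as a hypothetical example. It lets a 128-bit value be read back as 64- or 32-bit pieces without pointer casts.

#include <stdint.h>

/* Hypothetical example: take the high 64 bits of a full 128-bit product.
   Assumes the uint128_t typedef and u128_ovly union from this header, and a
   little-endian target (u64[1] is the high half). */
static inline uint64_t high64_of_product( uint64_t a, uint64_t b )
{
   u128_ovly p;
   p.u128 = (uint128_t)a * b;   // full 128-bit multiply
   return p.u64[1];
}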