mirror of https://github.com/JayDDee/cpuminer-opt.git
v3.23.0
@@ -731,6 +731,67 @@ static inline void extr_lane_8x32( void *d, const void *s,

#if defined(__AVX2__)

#if defined(__AVX512VL__) && defined(__AVX512VBMI__)

// TODO: enable for AVX10_256, AVX10_512

// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
   const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
   const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
   const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
   const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
   const __m128i s0 = casti_m128i( src,0 );
   const __m128i s1 = casti_m128i( src,1 );
   const __m128i s2 = casti_m128i( src,2 );
   const __m128i s3 = casti_m128i( src,3 );
   const __m128i s4 = casti_m128i( src,4 );

   casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d, 1 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d, 2 ) = _mm256_permutexvar_epi8( c2, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d, 3 ) = _mm256_permutexvar_epi8( c3, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d, 4 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d, 5 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d, 6 ) = _mm256_permutexvar_epi8( c2, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d, 7 ) = _mm256_permutexvar_epi8( c3, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d, 8 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d, 9 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d,10 ) = _mm256_permutexvar_epi8( c2, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d,11 ) = _mm256_permutexvar_epi8( c3, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d,12 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,13 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,14 ) = _mm256_permutexvar_epi8( c2, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,15 ) = _mm256_permutexvar_epi8( c3, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,16 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s4 ) );
   casti_m256i( d,17 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s4 ) );
   casti_m256i( d,18 ) = _mm256_permutexvar_epi8( c2, _mm256_castsi128_si256( s4 ) );
   casti_m256i( d,19 ) = _mm256_permutexvar_epi8( c3, _mm256_castsi128_si256( s4 ) );
}

#else

static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,

@@ -792,6 +853,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
                            _mm256_castsi128_si256( s4 ), c3 );
}

#endif   // AVX512VBMI else
#endif   // AVX2

// 16x32
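The VBMI variants above fold the 32-bit byte swap and the lane broadcast into a single vpermb: each 32-bit element of the index constant names the four source bytes in reversed order, so one permute both swaps and replicates a word of the 80-byte block. A minimal standalone sketch of the idea (the function name and unaligned load are illustrative, not part of the source):

#include <immintrin.h>

// Assumes AVX512VL + AVX512VBMI. Index 0x00010203 in every 32-bit lane selects
// source bytes 3,2,1,0, so a single _mm256_permutexvar_epi8 returns word 0 of
// the 16-byte block byte-swapped and broadcast to all eight 32-bit lanes.
static inline __m256i bswap32_bcast_word0( const void *p )
{
   const __m256i idx = _mm256_set1_epi32( 0x00010203 );
   const __m128i s   = _mm_loadu_si128( (const __m128i*)p );
   return _mm256_permutexvar_epi8( idx, _mm256_castsi128_si256( s ) );
}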

@@ -1173,10 +1235,12 @@ static inline void extr_lane_16x32( void *d, const void *s,
   ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
}

#if defined(__AVX512F__) && defined(__AVX512VL__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#if defined(__AVX512VBMI__)

// TODO: enable for AVX10_512

// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
@@ -1496,10 +1560,48 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
                            _mm256_castsi128_si256( s4 ), 0x55 );
}

#if defined(__AVX512VL__) && defined(__AVX512VBMI__)

// TODO: enable for AVX10_256, AVX10_512

static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
   const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
   const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
   const __m128i s0 = casti_m128i( src,0 );
   const __m128i s1 = casti_m128i( src,1 );
   const __m128i s2 = casti_m128i( src,2 );
   const __m128i s3 = casti_m128i( src,3 );
   const __m128i s4 = casti_m128i( src,4 );

   casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d,1 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s0 ) );
   casti_m256i( d,2 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d,3 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s1 ) );
   casti_m256i( d,4 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d,5 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s2 ) );
   casti_m256i( d,6 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,7 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s3 ) );
   casti_m256i( d,8 ) = _mm256_permutexvar_epi8( c0, _mm256_castsi128_si256( s4 ) );
   casti_m256i( d,9 ) = _mm256_permutexvar_epi8( c1, _mm256_castsi128_si256( s4 ) );
}

#else

static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
   const __m256i bswap_shuf = mm256_bcast_m128(
                  _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
   __m256i s0 = casti_m256i( src,0 );
   __m256i s1 = casti_m256i( src,1 );
   __m128i s4 = casti_m128i( src,4 );

@@ -1524,6 +1626,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                            _mm256_castsi128_si256( s4 ), 0x55 );
}

#endif

#endif   // AVX2

// 8x64 (AVX512)

@@ -1846,6 +1950,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,

#if defined(__AVX512F__) && defined(__AVX512VL__)

// TODO: enable for AVX10_512

// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{

@@ -2089,10 +2195,36 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
   d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
#if defined(__AVX512VBMI__)
// TODO: enable for AVX10_512

static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
   const __m512i bswap_shuf = mm512_bcast_m128(
                  _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
   const __m128i s0 = casti_m128i( src,0 );
   const __m128i s1 = casti_m128i( src,1 );
   const __m128i s2 = casti_m128i( src,2 );
   const __m128i s3 = casti_m128i( src,3 );
   const __m128i s4 = casti_m128i( src,4 );

   casti_m512i( d,0 ) = _mm512_permutexvar_epi8( bswap_shuf, _mm512_castsi128_si512( s0 ) );
   casti_m512i( d,1 ) = _mm512_permutexvar_epi8( bswap_shuf, _mm512_castsi128_si512( s1 ) );
   casti_m512i( d,2 ) = _mm512_permutexvar_epi8( bswap_shuf, _mm512_castsi128_si512( s2 ) );
   casti_m512i( d,3 ) = _mm512_permutexvar_epi8( bswap_shuf, _mm512_castsi128_si512( s3 ) );
   casti_m512i( d,4 ) = _mm512_permutexvar_epi8( bswap_shuf, _mm512_castsi128_si512( s4 ) );
}

#else

static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );

@@ -2108,14 +2240,15 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
   s3 = _mm_shuffle_epi8( s3, bswap_shuf );
   s4 = _mm_shuffle_epi8( s4, bswap_shuf );

   casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
   casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
   casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
   casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
   casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
}

#endif
#endif   // AVX512VBMI else
#endif   // AVX512
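Where VBMI is not available, the fallback above does the same job in two steps: an SSSE3 in-lane byte shuffle for the swap, then a 128-bit broadcast. A hedged sketch of that pattern using only SSSE3 plus AVX512F intrinsics (function name and load are illustrative):

#include <immintrin.h>

// Byte-swap the 32-bit words of one 16-byte block, then replicate the result
// to all four 128-bit lanes of a __m512i.
static inline __m512i bswap32_bcast128( const void *p )
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   __m128i s = _mm_loadu_si128( (const __m128i*)p );
   s = _mm_shuffle_epi8( s, bswap_shuf );        // swap bytes within each word
   return _mm512_broadcast_i32x4( s );           // copy to lanes 0..3
}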

// 2x256 (AVX512)

@@ -2955,6 +3088,8 @@ do { \

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// TODO: enable for AVX10_512

/*
#define mm512_intrlv_blend_128( hi, lo ) \
   _mm512_mask_blend_epi32( 0x0f0f, hi, lo )

@@ -43,9 +43,11 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;

// Deprecated. EVEX adds support for integer argument in broadcast instruction
// eliminating the need for an explicit move in most cases. Use the set1
// intrinsic with integers and let the compiler figure it out.
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used, allowing the
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
   __m128i a;

@@ -73,15 +75,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )

// Deprecated, use set1 directly
#define m128_const1_64    _mm_set1_epi64x
#define m128_const1_32    _mm_set1_epi32

// Deprecated, use set directly
#define m128_const_64     _mm_set_epi64x

// Pseudo constants
#define m128_zero         _mm_setzero_si128()
#define m128_one_128      mm128_mov64_128( 1 )
//#define m128_one_64     _mm_set1_epi64x( 1 )
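The deprecation notes above boil down to this: don't hand-roll GP-to-SIMD moves, use the standard cvt/set/set1 intrinsics and let the compiler pick the encoding (including EVEX embedded broadcast where available). A minimal sketch of the recommended style (helper names are hypothetical, not from the source):

#include <immintrin.h>
#include <stdint.h>

// Low 64 bits = n, upper bits zeroed; compiles to a single movq/vmovq.
static inline __m128i one_lane_64( const uint64_t n )
{  return _mm_cvtsi64_si128( (long long)n );  }

// n broadcast to all four 32-bit lanes; the compiler chooses the best
// broadcast sequence for the target ISA.
static inline __m128i all_lanes_32( const uint32_t n )
{  return _mm_set1_epi32( (int)n );  }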

@@ -141,7 +135,7 @@ static inline __m128i mm128_neg1_fn()

// Examples of simple operations using xim:

// Insert 32 bit integer into v at element c and return updated v.
// Copy i to element c of dest and copy remaining elements from v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
                                       const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
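mm128_xim_32 is not shown in this hunk; from the c<<4 immediate it is assumed here to wrap the SSE4.1 insertps instruction, whose imm8 bits 5:4 select the destination element. A hedged sketch of an equivalent insert without the helper (name and fixed element are illustrative):

#include <smmintrin.h>
#include <stdint.h>

// Insert i into element 1 of v, leaving the other elements intact.
// insertps imm8: bits 7:6 = source element, bits 5:4 = destination element,
// bits 3:0 = zero mask.
static inline __m128i insert32_elem1( const __m128i v, const uint32_t i )
{
   const __m128 vi = _mm_castsi128_ps( _mm_cvtsi32_si128( (int)i ) );
   return _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v ), vi, 1 << 4 ) );
}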

@@ -161,6 +155,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )

// Bitwise not (~v)
#if defined(__AVX512VL__)
// TODO: enable for AVX10_256

static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }

@@ -223,18 +218,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

#if defined(__AVX512VL__)
// TODO: enable for AVX10_256

// a ^ b ^ c
#define mm128_xor3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x96 )

// a & b & c
#define mm128_and3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x80 )

// a | b | c
#define mm128_or3( a, b, c )       _mm_ternarylogic_epi64( a, b, c, 0xfe )

// a ^ ( b & c )
#define mm128_xorand( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x78 )

// a & ( b ^ c )
#define mm128_andxor( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x60 )

// a ^ ( b | c )
#define mm128_xoror( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0x1e )

// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )

// a | ( b & c )
#define mm128_orand( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0xf8 )

// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b )         _mm_ternarylogic_epi64( a, b, b, 0x81 )

#else

#define mm128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )

#define mm128_and3( a, b, c )      _mm_and_si128( a, _mm_and_si128( b, c ) )

#define mm128_or3( a, b, c )       _mm_or_si128( a, _mm_or_si128( b, c ) )

#define mm128_xorand( a, b, c )    _mm_xor_si128( a, _mm_and_si128( b, c ) )

#define mm128_andxor( a, b, c )    _mm_and_si128( a, _mm_xor_si128( b, c ) )

#define mm128_xoror( a, b, c )     _mm_xor_si128( a, _mm_or_si128( b, c ) )

#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )

#define mm128_orand( a, b, c )     _mm_or_si128( a, _mm_and_si128( b, c ) )

#define mm128_xnor( a, b )         mm128_not( _mm_xor_si128( a, b ) )

#endif
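The 0x96, 0x80, 0xfe and similar immediates above are simply the truth table of the desired three-input function: bit (a<<2 | b<<1 | c) of the immediate holds f(a,b,c). A small sketch showing how such an immediate can be derived (helper name is illustrative, plain C, no intrinsics required):

// Derive a vpternlog immediate from a three-input boolean function.
// Example: xor3 (a^b^c) yields 0x96, and3 yields 0x80, or3 yields 0xfe.
static inline unsigned char ternlog_imm( int (*f)( int, int, int ) )
{
   unsigned char imm = 0;
   for ( int i = 0; i < 8; i++ )
      imm |= (unsigned char)( ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) & 1 ) << i );
   return imm;
}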

@@ -257,6 +288,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// transparency.

#if defined(__AVX512VL__)
// TODO: enable for AVX10_256

#define mm128_ror_64    _mm_ror_epi64
#define mm128_rol_64    _mm_rol_epi64

@@ -372,7 +404,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_32    mm128_swap64_32
#define mm128_shufll64_32    mm128_swap64_32

#if defined(__SSSE3__) && !defined(__AVX512VL__)
// TODO: enable for AVX10_256
#if defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                         0x0a09080f0e0d0c0b, 0x0201000706050403 ) )

@@ -380,7 +415,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                         0x09080f0e0d0c0b0a, 0x0100070605040302 ) )

@@ -390,7 +427,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

// Rotate 32 bit lanes

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_swap32_16( v ) _mm_ror_epi32( v, 16 )
#elif defined(__SSSE3__)
#define mm128_swap32_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                         0x0d0c0f0e09080b0a, 0x0504070601000302 ) )

@@ -400,7 +439,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr32_16    mm128_swap32_16
#define mm128_shufll32_16    mm128_swap32_16

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
#elif defined(__SSSE3__)
#define mm128_shuflr32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                         0x0c0f0e0d080b0a09, 0x0407060500030201 ) )

@@ -13,17 +13,14 @@
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// AVX10_256 will support AVX512VL instructions on CPUs limited to 256 bit
// vectors. This will require enabling when the compiler's AVX10 feature
// macros are known.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Full width byte shuffle is available with AVX512VL using the mask version
// with a full mask (-1).
// Instructions that can move data across a 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
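As a concrete example of the lane restriction noted above, reversing all 32 bytes of a __m256i with plain AVX2 needs an in-lane byte shuffle plus a separate cross-lane permute, because _mm256_shuffle_epi8 cannot move bytes between the two 128-bit halves. A minimal sketch (function name is illustrative):

#include <immintrin.h>

// Reverse the 32 bytes of v using AVX2 only: first reverse bytes within each
// 128-bit lane, then swap the two lanes.
static inline __m256i reverse_bytes_256( const __m256i v )
{
   const __m256i rev_lane = _mm256_set_epi8(
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 );
   const __m256i t = _mm256_shuffle_epi8( v, rev_lane );
   return _mm256_permute2x128_si256( t, t, 0x01 );
}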

#if defined(__AVX__)

@@ -66,6 +63,7 @@ typedef union
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
#if defined(__AVX512VL__)
// TODO: enable for AVX10_256

#define mm256_bcast128lo_64( i64 )   _mm256_maskz_set1_epi64( 0x55, i64 )
#define mm256_bcast128hi_64( i64 )   _mm256_maskz_set1_epi64( 0xaa, i64 )

@@ -81,11 +79,9 @@

#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )

// Deprecated
#define m256_const1_64    _mm256_set1_epi64x
#define m256_const1_32    _mm256_set1_epi32

#define mm256_set4_32( i3, i2, i1, i0 ) \
   mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )

//
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.

@@ -121,6 +117,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Basic operations without SIMD equivalent

#if defined(__AVX512VL__)
// TODO: enable for AVX10_256

static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }

@@ -140,8 +137,7 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )

#if defined(__AVX512VL__)

// AVX512 has ternary logic that supports any 3 input boolean expression.
// TODO: enable for AVX10_256

// a ^ b ^ c
#define mm256_xor3( a, b, c )      _mm256_ternarylogic_epi64( a, b, c, 0x96 )

@@ -176,31 +172,31 @@
#else

#define mm256_xor3( a, b, c ) \
   _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )

#define mm256_xor4( a, b, c, d ) \
   _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

#define mm256_and3( a, b, c ) \
   _mm256_and_si256( a, _mm256_and_si256( b, c ) )

#define mm256_or3( a, b, c ) \
   _mm256_or_si256( a, _mm256_or_si256( b, c ) )

#define mm256_xorand( a, b, c ) \
   _mm256_xor_si256( a, _mm256_and_si256( b, c ) )

#define mm256_andxor( a, b, c ) \
   _mm256_and_si256( a, _mm256_xor_si256( b, c ) )

#define mm256_xoror( a, b, c ) \
   _mm256_xor_si256( a, _mm256_or_si256( b, c ) )

#define mm256_xorandnot( a, b, c ) \
   _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )

#define mm256_orand( a, b, c ) \
   _mm256_or_si256( a, _mm256_and_si256( b, c ) )

#define mm256_xnor( a, b ) \
   mm256_not( _mm256_xor_si256( a, b ) )
@@ -226,6 +222,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
// transparency.
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
#define mm256_ror_64 _mm256_ror_epi64
|
||||
#define mm256_rol_64 _mm256_rol_epi64
|
||||
@@ -380,6 +377,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
#define mm256_shuflr64_32 mm256_swap64_32
|
||||
#define mm256_shufll64_32 mm256_swap64_32
|
||||
|
||||
//TODO Enable for AVX10_256
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
|
||||
#else
|
||||
|

@@ -113,10 +113,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_set2_64( i1, i0 ) \
   mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )

// Deprecated, use set
#define m512_const1_64    _mm512_set1_epi64
#define m512_const1_32    _mm512_set1_epi32

// Pseudo constants.
#define m512_zero    _mm512_setzero_si512()
// Deprecated