Jay D Dee
2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions

View File

@@ -731,6 +731,67 @@ static inline void extr_lane_8x32( void *d, const void *s,
#if defined(__AVX2__)
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 2 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 3 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 6 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 7 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d, 9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,10 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,11 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,12 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,13 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,14 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,15 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,16 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,17 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,18 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,19 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s4 ) );
}
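// For reference, a scalar model of the layout produced above (a sketch for
// illustration only, not used anywhere; intrlv80_8x32_ref is a hypothetical
// name): each of the 20 32 bit words of the 80 byte block is byte swapped
// and written to all 8 interleaved lanes.
static inline void intrlv80_8x32_ref( uint32_t *d, const uint32_t *s )
{
   for ( int i = 0; i < 20; i++ )                    // 80 bytes = 20 words
   {
      const uint32_t w = __builtin_bswap32( s[i] );  // gcc/clang builtin
      for ( int lane = 0; lane < 8; lane++ )         // broadcast to all lanes
         d[ i*8 + lane ] = w;
   }
}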
#else
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
@@ -792,6 +853,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
_mm256_castsi128_si256( s4 ), c3 );
}
#endif // AVX512VBMI else
#endif // AVX2
// 16x32
@@ -1173,10 +1235,12 @@ static inline void extr_lane_16x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
// TODO Enable for AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
@@ -1496,10 +1560,48 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,2 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,3 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,6 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,7 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
}
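// The permute indexes do double duty: within every 64 bit element c0 selects
// source bytes 3,2,1,0,7,6,5,4 (the first qword with each 32 bit half byte
// swapped) and c1 selects bytes 11,10,9,8,15,14,13,12 (the second qword
// likewise), so a single vpermb both byte swaps and broadcasts to all 4 lanes.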
#else
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m128i s4 = casti_m128i( src,4 );
@@ -1524,6 +1626,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#endif
#endif // AVX2
// 8x64 (AVX512)
@@ -1846,6 +1950,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
#if defined(__AVX512F__) && defined(__AVX512VL__)
//TODO Enable for AVX10_512
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
@@ -2089,10 +2195,36 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
//TODO Enable for AVX10_512
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s0 ) );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s1 ) );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s2 ) );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s3 ) );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s4 ) );
}
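// The source is zero extended to 512 bits and every index byte in bswap_shuf
// is in the range 0..15, so the single permute above both byte swaps the
// 32 bit words and replicates the 128 bit block to all 4 lanes.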
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
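// The control above reverses the bytes within each 32 bit word: destination
// bytes 0..3 take source bytes 3,2,1,0, and likewise for the other words.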
@@ -2108,14 +2240,15 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
}
#endif // AVX512VBMI ELSE
#endif // AVX512
// 2x256 (AVX512)
@@ -2955,6 +3088,8 @@ do { \
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//TODO Enable for AVX10_512
/*
#define mm512_intrlv_blend_128( hi, lo ) \
_mm512_mask_blend_epi32( 0x0f0f, hi, lo )

View File

@@ -43,9 +43,11 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -73,15 +75,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Deprecated, use set1 directly
#define m128_const1_64 _mm_set1_epi64x
#define m128_const1_32 _mm_set1_epi32
// Deprecated, use set directly
#define m128_const_64 _mm_set_epi64x
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
//#define m128_one_64 _mm_set1_epi64x( 1 )
@@ -141,7 +135,7 @@ static inline __m128i mm128_neg1_fn()
// Examples of simple operations using xim:
// Copy i to element c of dest and copy remaining elements from v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -161,6 +155,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Bitwise not (~v)
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -223,18 +218,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
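// The immediate is just the truth table of the expression: bit i gives the
// result for inputs a = bit 2 of i, b = bit 1, c = bit 0. A sketch of how
// the constants above can be derived (ternlog_imm8 is a hypothetical helper,
// not used by this code):
//
//   static inline uint8_t ternlog_imm8( int (*f)( int, int, int ) )
//   {
//      uint8_t imm = 0;
//      for ( int i = 0; i < 8; i++ )
//         imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
//      return imm;
//   }
//
// For f = a ^ b ^ c this yields 0x96, for a & b & c 0x80, and for
// a ^ ( b & c ) 0x78, matching the constants used here.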
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#endif
@@ -257,6 +288,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -372,7 +404,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
@@ -380,7 +415,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
@@ -390,7 +427,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 32 bit lanes
#if defined(__AVX512VL__)
#define mm128_swap32_16( v ) _mm_ror_epi32( v, 16 )
#elif defined(__SSSE3__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
@@ -400,7 +439,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
#elif defined(__SSSE3__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
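// The control selects source bytes 1,2,3,0 within every 32 bit lane, i.e. a
// rotate right by 8 bits of each lane, matching _mm_ror_epi32( v, 8 ) above.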

View File

@@ -13,17 +13,14 @@
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// AVX10_256 will support AVX512VL instructions on CPUs limited to 256 bit
// vectors. This will require enabling when the compiler's AVX10 feature
// macros are known.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Instructions that can move data across the 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
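// For example (a sketch assuming AVX512VL & AVX512VBMI), reversing all 32
// bytes of a 256 bit vector needs a cross lane permute such as
// _mm256_permutexvar_epi8, while _mm256_shuffle_epi8 alone can only reverse
// the bytes within each 128 bit half.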
#if defined(__AVX__)
@@ -66,6 +63,7 @@ typedef union
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_bcast128lo_64( i64 ) _mm256_maskz_set1_epi64( 0x55, i64 )
#define mm256_bcast128hi_64( i64 ) _mm256_maskz_set1_epi64( 0xaa, i64 )
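// Only the low 4 mask bits matter for 4 64 bit elements: 0x55 (0101) keeps
// the low element of each 128 bit lane and zeroes the high one, 0xaa (1010)
// does the opposite.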
@@ -81,11 +79,9 @@ typedef union
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated
#define m256_const1_64 _mm256_set1_epi64x
#define m256_const1_32 _mm256_set1_epi32
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
//
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
@@ -121,6 +117,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Basic operations without SIMD equivalent
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -140,8 +137,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
#if defined(__AVX512VL__)
// AVX512 has ternary logic that supports any 3 input boolean expression.
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -176,31 +172,31 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
@@ -226,6 +222,7 @@ static inline __m256i mm256_not( const __m256i v )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -380,6 +377,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else

View File

@@ -113,10 +113,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated, use set
#define m512_const1_64 _mm512_set1_epi64
#define m512_const1_32 _mm512_set1_epi32
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// Deprecated