This commit is contained in:
Jay D Dee
2023-09-28 18:43:18 -04:00
parent be88afc349
commit bc5a5c6df8
88 changed files with 5526 additions and 3361 deletions

View File

@@ -738,10 +738,10 @@ static inline void extr_lane_8x32( void *d, const void *s,
// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
const __m256i c0 = v256_32( 0x00010203 );
const __m256i c1 = v256_32( 0x04050607 );
const __m256i c2 = v256_32( 0x08090a0b );
const __m256i c3 = v256_32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -796,7 +796,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi32( 1 );
const __m256i c1 = v256_32( 1 );
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1244,10 +1244,10 @@ static inline void extr_lane_16x32( void *d, const void *s,
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const __m512i c0 = _mm512_set1_epi32( 0x00010203 );
const __m512i c1 = _mm512_set1_epi32( 0x04050607 );
const __m512i c2 = _mm512_set1_epi32( 0x08090a0b );
const __m512i c3 = _mm512_set1_epi32( 0x0c0d0e0f );
const __m512i c0 = v512_32( 0x00010203 );
const __m512i c1 = v512_32( 0x04050607 );
const __m512i c2 = v512_32( 0x08090a0b );
const __m512i c3 = v512_32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -1302,7 +1302,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi32( 1 );
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
__m128i s0 = casti_m128i( src,0 );
@@ -1566,8 +1566,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
const __m256i c0 = v256_64( 0x0405060700010203 );
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -1958,16 +1958,16 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
__m512i *d = (__m512i*)dst;
const uint64_t *s = (const uint64_t*)src;
d[0] = _mm512_set1_epi64( s[0] );
d[1] = _mm512_set1_epi64( s[1] );
d[2] = _mm512_set1_epi64( s[2] );
d[3] = _mm512_set1_epi64( s[3] );
d[4] = _mm512_set1_epi64( s[4] );
d[5] = _mm512_set1_epi64( s[5] );
d[6] = _mm512_set1_epi64( s[6] );
d[7] = _mm512_set1_epi64( s[7] );
d[8] = _mm512_set1_epi64( s[8] );
d[9] = _mm512_set1_epi64( s[9] );
d[0] = v512_64( s[0] );
d[1] = v512_64( s[1] );
d[2] = v512_64( s[2] );
d[3] = v512_64( s[3] );
d[4] = v512_64( s[4] );
d[5] = v512_64( s[5] );
d[6] = v512_64( s[6] );
d[7] = v512_64( s[7] );
d[8] = v512_64( s[8] );
d[9] = v512_64( s[9] );
}
// byte swap and broadcast to all lanes
@@ -1977,8 +1977,8 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m512i c0 = _mm512_set1_epi64( 0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi64( 0x0c0d0e0f08090a0b );
const __m512i c0 = v512_64( 0x0405060700010203 );
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -2013,7 +2013,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi64( 1 );
const __m512i c1 = v512_64( 1 );
__m128i s0 = casti_m128i( src,0 );
__m128i s1 = casti_m128i( src,1 );
__m128i s2 = casti_m128i( src,2 );

View File

@@ -3,7 +3,7 @@
#if defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//
// 128 bit SSE vectors
//
@@ -20,9 +20,9 @@
// define a local const for repeated references to the same constant.
//
// One common use for simd constants is as a control index for vector
// instructions like blend and shuffle. Although the ultimate instruction
// may execute in a single clock cycle, generating the control index adds
// several more cycles to the entire operation.
// shuffle instructions. Although the ultimate instruction may execute in a
// single clock cycle, generating the control index adds several more cycles
// to the entire operation.
//
// All of the utilities here assume all data is in registers except
// in rare cases where arguments are pointers.
@@ -32,7 +32,7 @@
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Used instead of casting.
@@ -43,8 +43,8 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_64(i) _mm_set1_epi64x(i)
#define v128_32(i) _mm_set1_epi32(i)
#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
@@ -81,8 +81,6 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
//#define m128_one_64 _mm_set1_epi64x( 1 )
#define m128_one_32 _mm_set1_epi32( 1 )
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -100,7 +98,7 @@ static inline __m128i mm128_neg1_fn()
#if defined(__SSE4_1__)
/////////////////////////////
/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//

View File

@@ -90,10 +90,7 @@ typedef union
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
//#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
#define m256_one_64 _mm256_set1_epi64x( 1 )
#define m256_one_32 _mm256_set1_epi32( 1 )
static inline __m256i mm256_neg1_fn()
{

View File

@@ -97,8 +97,8 @@ typedef union
uint64_t u64[8];
} __attribute__ ((aligned (64))) m512_ovly;
#define v512_64(i) _mm512_set1_epi64(i)
#define v512_32(i) _mm512_set1_epi32(i)
#define v512_64(i64) _mm512_set1_epi64(i64)
#define v512_32(i32) _mm512_set1_epi32(i32)
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
@@ -118,9 +118,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// Deprecated
#define m512_one_64 _mm512_set1_epi64( 1 )
#define m512_one_32 _mm512_set1_epi32( 1 )
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()