This commit is contained in:
Jay D Dee
2021-10-10 22:50:19 -04:00
parent 2cd1507c2e
commit 47cc5dcff5
14 changed files with 2057 additions and 2827 deletions

View File

@@ -1,7 +1,7 @@
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
#if defined(__AVX2__)
//#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
@@ -14,7 +14,9 @@
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Used instead if casting.
#if defined(__AVX__)
// Used instead of casting.
typedef union
{
__m256i m256;
@@ -23,6 +25,28 @@ typedef union
uint32_t u32[8];
} __attribute__ ((aligned (32))) m256_ovly;
//
// Pointer casting
// Each macro below is a plain C cast of p to (__m256i*); no load/store
// intrinsic is used. Indexes and offsets count whole __m256i elements,
// not bytes.
// p = any aligned pointer
// returns p reinterpreted as a pointer to __m256i, without dereferencing it
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p as a __m256i lvalue; do any pointer arithmetic on p in units of
// p's own type before passing it in, the cast is applied afterwards
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index (in __m256i elements)
// returns element p[i] of p viewed as an array of __m256i
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset (in __m256i elements)
// returns pointer p+o, of type __m256i*
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
#endif
#if defined(__AVX2__)
// Move integer to low element of vector, other elements are set to zero.
// Built by widening the 128 bit result of mm128_mov64_128 with a cast.
// NOTE(review): Intel documents the upper 128 bits of
// _mm256_castsi128_si256 as undefined, so the zeroing of the upper half
// presumably relies on the instruction the compiler emits for
// mm128_mov64_128 - confirm before depending on it.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
@@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn()
// Extract one 128 bit half of a 256 bit vector as a __m128i.
// v = 256 bit integer vector
// lo: low 128 bits, obtained with a cast intrinsic.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
// hi: high 128 bits, selected by the immediate index 1.
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Pointer casting
// Each macro below is a plain C cast of p to (__m256i*); no load/store
// intrinsic is used. Indexes and offsets count whole __m256i elements,
// not bytes.
// p = any aligned pointer
// returns p reinterpreted as a pointer to __m256i, without dereferencing it
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p as a __m256i lvalue; do any pointer arithmetic on p in units of
// p's own type before passing it in, the cast is applied afterwards
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index (in __m256i elements)
// returns element p[i] of p viewed as an array of __m256i
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset (in __m256i elements)
// returns pointer p+o, of type __m256i*
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
//
// Memory functions
// n = number of 256 bit (32 byte) vectors

View File

@@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
// Rotate 256 bit lanes by one 64 bit element
// The immediate holds four 2 bit source indexes, lowest destination element
// first, applied to the four 64 bit elements of each 256 bit lane.
// 0x39 = 0b00_11_10_01: destination elements 0,1,2,3 take source elements
// 1,2,3,0, i.e. every element moves one position toward element 0 (right).
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
// 0x93 = 0b10_01_00_11: destination elements 0,1,2,3 take source elements
// 3,0,1,2, i.e. every element moves one position away from element 0 (left).
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
@@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
// add shuflr shufll functions performing rotate, returning first arg
// They're faster than doing both, when both not needed.
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
// rotated v1
// visually confusing for shufl2r because of arg order. First arg is always