v3.18.1

2026-07-14 10:56:50 +00:00 · 2021-10-10 22:50:19 -04:00
parent 2cd1507c2e
commit 47cc5dcff5
14 changed files with 2057 additions and 2827 deletions
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -1,7 +1,7 @@
 #if !defined(SIMD_256_H__)
 #define SIMD_256_H__ 1

-#if defined(__AVX2__)
+//#if defined(__AVX2__)

 /////////////////////////////////////////////////////////////////////
 //
@@ -14,7 +14,9 @@
 // is limited because 256 bit vectors are less likely to be used when 512
 // is available.

-// Used instead if casting.
+#if defined(__AVX__)
+
+// Used instead of casting.
 typedef union
 {
   __m256i m256;
@@ -23,6 +25,28 @@ typedef union
   uint32_t u32[8];
 } __attribute__ ((aligned (32))) m256_ovly;

+//
+// Pointer casting
+
+// p = any aligned pointer
+// returns p as pointer to vector type, not very useful
+#define castp_m256i(p) ((__m256i*)(p))
+
+// p = any aligned pointer
+// returns *p, watch your pointer arithmetic
+#define cast_m256i(p) (*((__m256i*)(p)))
+
+// p = any aligned pointer, i = scaled array index
+// returns value p[i]
+#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
+
+// p = any aligned pointer, o = scaled offset
+// returns pointer p+o
+#define casto_m256i(p,o) (((__m256i*)(p))+(o))
+
+#endif
+#if defined(__AVX2__)
+

 // Move integer to low element of vector, other elements are set to zero.
 #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
@@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn()
 #define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
 #define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )

-//
-// Pointer casting
-
-// p = any aligned pointer
-// returns p as pointer to vector type, not very useful
-#define castp_m256i(p) ((__m256i*)(p))
-
-// p = any aligned pointer
-// returns *p, watch your pointer arithmetic
-#define cast_m256i(p) (*((__m256i*)(p)))
-
-// p = any aligned pointer, i = scaled array index
-// returns value p[i]
-#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
-
-// p = any aligned pointer, o = scaled offset
-// returns pointer p+o
-#define casto_m256i(p,o) (((__m256i*)(p))+(o))
-
-
 //
 // Memory functions
 // n = number of 256 bit (32 byte) vectors
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )

 // Rotate 256 bit lanes by one 64 bit element
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
-
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

 // Rotate 256 bit lanes by one 32 bit element
@@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // shufl2r is 2 input ...
 // Drop macros? They can easilly be rebuilt using shufl2 functions

-// add shuflr shufll functions performing rotate, returning first arg
-// They're faster than doing both, when both not needed.
-
 // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
 // rotated v1 
 // visually confusing for shif2r because of arg order. First arg is always