v3.23.4

2025-09-17 23:44:27 +00:00 · 2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions
--- a/algo/simd/vector.h
+++ b/algo/simd/vector.h
@@ -3,14 +3,10 @@

 #include "compat.h"

-#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
-
 /******************************* 
 * Using GCC vector extensions * 
 *******************************/

-#if   defined(__SSE2__)
-
 //typedef unsigned char v16qi __attribute__ ((vector_size (16)));
 typedef char          v16qi __attribute__ ((vector_size (16)));
 typedef short          v8hi __attribute__ ((vector_size (16)));
@@ -65,6 +61,10 @@ union u32 {
 #define v32_andn(x,y) ((v32) vec_andn((x), (y)))
 #endif

+// TODO: add aarch64 support for widening multiply
+
+#if defined(__SSE2__)
+
 #define vec_and(x,y) ((x)&(y))
 #define vec_or(x,y)  ((x)|(y))
 #define vec_xor(x,y) ((x)^(y))
@@ -127,72 +127,11 @@ union u32 {

 #define CV(x) {{x, x, x, x, x, x, x, x}}

-#elif defined(__ALTIVEC__)
-
-#include <altivec.h>
-
-typedef vector unsigned char  v8;
-typedef vector signed   short v16;
-typedef vector unsigned int   v32;
-
-#define V3216(x) ((v16) (x))
-#define V1632(x) ((v32) (x))
-#define  V168(x) ( (v8) (x))
-#define  V816(x) ((v16) (x))
-
-#define V16_SIZE 8
-#define print_vec print_sse
-
-#define MAKE_VECT(x, ...) {{x, __VA_ARGS__}}
-
-#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x)
-#define CV16(x)  ((vector   signed short) {x,x,x,x,x,x,x,x})
-#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x})
-#define CV32(x)  ((vector unsigned int  ) {x,x,x,x})
-
-union cv {
-  unsigned short u16[8];
-  v16 v16;
-};
-
-union cv8 {
-  unsigned char u8[16];
-  v8 v8;
-};
-
-union ucv {
-  unsigned short u16[8];
-  vector unsigned char v16;
-};
-
-// Nasty hack to avoid macro expansion madness
-
-
-/* altivec.h is broken with Gcc 3.3 is C99 mode  */
-#if defined __STDC__ && __STDC_VERSION__ >= 199901L
-#define typeof __typeof
-#endif
-
-MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) {
-  return vec_and (x, y);
-}
-
-MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) {
-  return vec_or (x, y);
-}
-
-MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
-  return vec_xor (x, y);
-}
-
-#undef vec_and
-#undef vec_or
-#undef vec_xor
-
-#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y))
-#define vec_or(x,y)  ((__typeof(x)) vec_or_fun((v16) x, (v16) y))
-#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y))
+#elif defined(__aarch64__) && defined(__ARM_NEON)

+#define vec_and( x, y )    v128_and( x, y )
+#define vec_or(x,y)        v128_or( x, y )
+#define vec_xor(x,y)       v128_xor( x, y )

 #define v16_and vec_and
 #define v16_or  vec_or
@@ -202,128 +141,36 @@ MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
 #define v32_or  vec_or
 #define v32_xor vec_xor

+#define vec_andn( x,y )   v128_andnot( x, y )
+#define v16_andn          vec_andn 
+#define v32_andn          vec_andn

-#define v32_add vec_add
+#define v32_add( x, y )   v128_add32( x, y )

-#define v16_add vec_add
-#define v16_sub vec_sub
-#define v16_mul(a,b) vec_mladd(a,b,CV16(0))
+#define v16_add( x, y )        v128_add16( x, y )
+#define v16_sub( x, y )        v128_sub16( x, y )
+#define v16_mul( x, y )        v128_mul16( x, y )
+#define v16_neg(x)             v128_negate16( x )
+#define v16_shift_l( x, c )    v128_sl16( x, c )   /* forward both arguments; previously expanded to the bare name */
+#define v16_shift_r            v128_sr16   /* NOTE(review): removed back ends used arithmetic shifts (vec_sra, _mm_srai_epi16); confirm v128_sr16 is arithmetic */
+#define v16_cmp                v128_cmpgt16

-vector unsigned   short ZZ = {0,0,0,0,0,0,0,0};
+#define v16_interleavel        v128_unpacklo16
+#define v16_interleaveh        v128_unpackhi16 

-v16 v16_shift_l(v16 x,int s) {
-  vector unsigned short shift = {s,s,s,s,s,s,s,s};
-  v16 y = vec_sl (x, shift);
-  return y;
-}
-#define v16_shift_l(x,s)  vec_sl (x,CVU16(s))
-#define v16_shift_r(x,s)  vec_sra(x,CVU16(s))
-#define v16_cmp      vec_cmpgt
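+// NOTE(review): the x86 (ia32) builtins below were observed to compile in this aarch64 branch; why, and whether they are correct on ARM, is unconfirmed.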
+#define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
+#define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))

-#define v16_mergel(a,b)   V1632(vec_mergeh(b,a))
-#define v16_mergeh(a,b)   V1632(vec_mergel(b,a))
+#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
+#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))

-#define v16_interleavel(a,b)   vec_mergeh(a,b)
-#define v16_interleaveh(a,b)   vec_mergel(a,b)
+#define v32_shift_l            v128_sl32
+#define v32_shift_r            v128_sr32

-#define v8_mergel(a,b) V816(vec_mergeh(b,a))
-#define v8_mergeh(a,b) V816(vec_mergel(b,a))
+#define v32_rotate(x,n)        v128_rol32( x, n )   /* forward both arguments; previously expanded to the bare name */

-#define v32_rotate(x,s)  vec_rl(x,CV32(s))
-
-// #define v32_unpckl   vec_mergel
-// #define v32_unpckh   vec_mergeh
-
-#define vector_shuffle(x,s) vec_perm(x,x,s)
-
-static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
-static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
-static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
-
-#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s)
-
-//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
-static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12};
-
-#define v32_bswap(x) vector_shuffle(x,SHUFSWAP)
-
-#else
-
-#error "I don't know how to vectorize on this architecture."
-
-#endif
-
-#else
-
-/******************************** 
- * Using MSVC/ICC vector instrinsics * 
- ********************************/
-
-#include <emmintrin.h>
-
-typedef __m128i  v8;
-typedef __m128i v16;
-typedef __m128i v32;
-
-#define V3216(x) (x)
-#define V1632(x) (x)
-#define  V168(x) (x)
-#define  V816(x) (x)
-
-#define V16_SIZE 8
-
-union cv {
-  unsigned short u16[8];
-  v16 v16;
-};
-
-union cv8 {
-  unsigned char u8[16];
-  v8 v8;
-};
-
-#define CV(x) {{x, x, x, x, x, x, x, x}}
-
-#define vec_and      _mm_and_si128
-#define vec_or       _mm_or_si128
-#define vec_xor      _mm_xor_si128
-
-#define v16_and vec_and
-#define v16_or  vec_or
-#define v16_xor vec_xor
-
-#define v32_and vec_and
-#define v32_or  vec_or
-#define v32_xor vec_xor
-
-#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s)
-
-#define v32_add      _mm_add_epi32
-
-#define v16_add      _mm_add_epi16
-#define v16_sub      _mm_sub_epi16
-#define v16_mul      _mm_mullo_epi16
-#define v16_neg(x)   (-(x))
-#define v16_shift_l  _mm_slli_epi16
-#define v16_shift_r  _mm_srai_epi16
-#define v16_cmp      _mm_cmpgt_epi16
-
-#define v16_interleavel   _mm_unpacklo_epi16
-#define v16_interleaveh   _mm_unpackhi_epi16
-
-#define v16_mergel   _mm_unpacklo_epi16
-#define v16_mergeh   _mm_unpackhi_epi16
-
-#define v8_mergel    _mm_unpacklo_epi8
-#define v8_mergeh    _mm_unpackhi_epi8
-
-#define v32_shift_l  _mm_slli_epi32
-#define v32_shift_r  _mm_srli_epi32
-
-#define v32_rotate(x,n)                                 \
-  vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
-
-#define v32_shuf     _mm_shuffle_epi32
+#define v32_shuf __builtin_ia32_pshufd   /* NOTE(review): x86 (ia32) builtin inside the aarch64 branch; confirm it resolves on ARM */

 #define SHUFXOR_1 0xb1          /* 0b10110001 */
 #define SHUFXOR_2 0x4e          /* 0b01001110 */
@@ -332,13 +179,25 @@ union cv8 {
 #define CAT(x, y) x##y
 #define XCAT(x,y) CAT(x,y)

-//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s)
 #define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

 #define v32_bswap(x) (x)

+#define v16_broadcast(x) ({  /* x is assumed to fit in 16 bits (TODO confirm) */ \
+      union u32 bc_u_;       /* suffixed names avoid capturing caller identifiers */ \
+      u32 bc_x_ = (x);                          \
+      bc_u_.u[0] = bc_x_ | (bc_x_ << 16);  /* x in both 16-bit halves of u[0] */ \
+      V3216(v32_shuf(bc_u_.v,0)); })
+
+#define CV(x) {{x, x, x, x, x, x, x, x}}
+
+#else
+
+#error "I don't know how to vectorize on this architecture."
+
 #endif

+
 /* Twiddle tables */

  static const union cv FFT64_Twiddle[] = {