v3.9.8

2025-09-17 23:44:27 +00:00 · 2019-09-26 22:37:26 -04:00
parent a042fb7612
commit 01550d94a2
47 changed files with 1240 additions and 1544 deletions
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -42,9 +42,11 @@

 static inline __m128i m128_one_128_fn()
 {
+   register uint64_t one = 1;   // the constant 1 is staged in a general register and fed to the asm as input %1
   register __m128i a;
-   asm( "movq $1, %0\n\t"
-        : "=x"(a) );
+   asm( "movq %1, %0\n\t"
+        : "=x"(a)
+        : "r" (one) );          // movq from general register to XMM: low 64 bits = 1, upper 64 bits zeroed
   return a;
 }
 #define m128_one_128    m128_one_128_fn()
@@ -54,9 +56,9 @@ static inline __m128i m128_one_64_fn()
  register uint64_t one = 1;
  register __m128i a;
  asm( "movq %1, %0\n\t"
-       : "=x"(a)
-       : "r"(one) );
-  return _mm_shuffle_epi32( a, 0x04 );
+       : "=x" (a)
+       : "r"  (one) );
+  return _mm_shuffle_epi32( a, 0x44 );
 }
 #define m128_one_64    m128_one_64_fn()

@@ -65,8 +67,8 @@ static inline __m128i m128_one_32_fn()
  register uint32_t one = 1;
  register __m128i a;
  asm( "movd %1, %0\n\t"
-       : "=x"(a)
-       : "r"(one) );
+       : "=x" (a)
+       : "r"  (one) );
  return _mm_shuffle_epi32( a, 0x00 );
 }
 #define m128_one_32    m128_one_32_fn()
@@ -76,8 +78,8 @@ static inline __m128i m128_one_16_fn()
  register uint32_t one = 0x00010001;
  register __m128i a;
  asm( "movd %1, %0\n\t"
-       : "=x"(a)
-       : "r"(one) );
+       : "=x" (a)
+       : "r"  (one) );
  return _mm_shuffle_epi32( a, 0x00 );
 }
 #define m128_one_16    m128_one_16_fn()
@@ -87,8 +89,8 @@ static inline __m128i m128_one_8_fn()
  register uint32_t one = 0x01010101;
  register __m128i a;
  asm( "movd %1, %0\n\t"
-       : "=x"(a)
-       : "r"(one) );
+       : "=x" (a)
+       : "r"  (one) );
  return _mm_shuffle_epi32( a, 0x00 );
 }
 #define m128_one_8    m128_one_8_fn()
@@ -97,7 +99,7 @@ static inline __m128i m128_neg1_fn()
 {
   __m128i a;
   asm( "pcmpeqd %0, %0\n\t"
-        : "=x"(a) );
+        : "=x" (a) );
   return a;
 }
 #define m128_neg1    m128_neg1_fn()
@@ -108,7 +110,7 @@ static inline __m128i mm128_mov64_128( uint64_t n )
  register __m128i a;
  asm( "movq %1, %0\n\t"
       : "=x" (a)
-       : "r" (n) );
+       : "r"  (n) );
  return  a;
 }

@@ -117,7 +119,7 @@ static inline __m128i mm128_mov32_128( uint32_t n )
  register __m128i a;
  asm( "movd %1, %0\n\t"
       : "=x" (a)
-       : "r" (n) );
+       : "r"  (n) );
  return  a;
 }

@@ -126,7 +128,7 @@ static inline uint64_t mm128_mov128_64( __m128i a )
  register uint64_t n;
  asm( "movq %1, %0\n\t"
       : "=x" (n)
-       : "r" (a) );
+       : "r"  (a) );
  return  n;
 }

@@ -135,10 +137,28 @@ static inline uint32_t mm128_mov128_32( __m128i a )
  register uint32_t n;
  asm( "movd %1, %0\n\t"
       : "=x" (n)
-       : "r" (a) );
+       : "r"  (a) );
  return  n;
 }

+static inline __m128i m128_const1_64( const uint64_t n )   // returns a vector holding n in both 64-bit lanes
+{
+  register __m128i a;
+  asm( "movq %1, %0\n\t"
+       : "=x" (a)
+       : "r"  (n) );                     // n moves from a general register into the low 64 bits of a; NOTE(review): 64-bit movq from a GPR presumably needs an x86-64 target - confirm
+  return _mm_shuffle_epi32( a, 0x44 );   // 0x44 picks dwords 0,1,0,1, so the low qword is copied into the high qword
+}
+
+static inline __m128i m128_const1_32( const uint32_t n )   // returns a vector holding n in all four 32-bit lanes
+{
+  register __m128i a;
+  asm( "movd %1, %0\n\t"
+       : "=x" (a)
+       : "r"  (n) );                     // n moves from a general register into the low 32 bits of a
+  return _mm_shuffle_epi32( a, 0x00 );   // 0x00 picks dword 0 for every destination lane, broadcasting n
+}
+
 #if defined(__SSE41__)

 // alternative to _mm_set_epi64x, doesn't use mem,
@@ -148,11 +168,12 @@ static inline __m128i m128_const_64( const uint64_t hi, const uint64_t lo )
   register __m128i a;
   asm( "movq %2, %0\n\t"
        "pinsrq $1, %1, %0\n\t"
-        : "=x"(a)
-        : "r"(hi), "r"(lo) );
+        : "=x" (a)
+        : "r"  (hi), "r" (lo) );
   return a;
 }

+/*
 static inline __m128i m128_const1_64( const uint64_t n )
 {
   register __m128i a;
@@ -162,13 +183,13 @@ static inline __m128i m128_const1_64( const uint64_t n )
        : "r"(n) );
   return a;
 }
-
+*/
 #else

 // #define m128_one_128   _mm_set_epi64x( 0ULL, 1ULL )

 #define m128_const_64  _mm_set_epi64x
-#define m128_const1_64 _mm_set1_epi64x
+// #define m128_const1_64 _mm_set1_epi64x

 #endif

@@ -263,46 +284,6 @@ do { \
 #endif


-// Gather and scatter data.
-// Surprise, they don't use vector instructions. Several reasons why.
-// Since scalar data elements are being manipulated scalar instructions
-// are most appropriate and can bypass vector registers. They are faster
-// and more efficient on a per instruction basis due to the higher clock
-// speed and greater avaiability of execution resources. It's good for
-// interleaving data buffers for parallel processing.
-// May suffer overhead if data is already in a vector register. This can
-// usually be easilly avoided by the coder. Sometimes _mm_set is simply better.
-// These macros are likely to be used when transposing matrices rather than
-// conversions of a single vector.
-
-// Gather data elements into contiguous memory for vector use.
-// Source args are appropriately sized value integers, destination arg  is a
-// type agnostic pointer.
-// Vector alignment is not required, though likely. Appropriate integer
-// alignment satisfies these macros.
-
-// rewrite using insert
-#define mm128_gather_64( d, s0, s1 ) \
-    ((uint64_t*)d)[0] = (uint64_t)s0; \
-    ((uint64_t*)d)[1] = (uint64_t)s1;
-
-#define mm128_gather_32( d, s0, s1, s2, s3 ) \
-    ((uint32_t*)d)[0] = (uint32_t)s0; \
-    ((uint32_t*)d)[1] = (uint32_t)s1; \
-    ((uint32_t*)d)[2] = (uint32_t)s2; \
-    ((uint32_t*)d)[3] = (uint32_t)s3;
-
-// Scatter data from contiguous memory.
-#define mm128_scatter_64( d0, d1, s ) \
-   *( (uint64_t*)d0) = ((uint64_t*)s)[0]; \
-   *( (uint64_t*)d1) = ((uint64_t*)s)[1]; 
-
-#define mm128_scatter_32( d0, d1, d2, d3, s ) \
-   *( (uint32_t*)d0) = ((uint32_t*)s)[0]; \
-   *( (uint32_t*)d1) = ((uint32_t*)s)[1]; \
-   *( (uint32_t*)d2) = ((uint32_t*)s)[2]; \
-   *( (uint32_t*)d3) = ((uint32_t*)s)[3];
-
 // Memory functions
 // Mostly for convenience, avoids calculating bytes.
 // Assumes data is alinged and integral.