v25.4

2026-02-22 16:33:08 +00:00 · 2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -140,28 +140,28 @@
 #include <stdint.h>
 #include <stddef.h>

-// AVX512 macros are not a reliable indicator of 512 bit vector capability
-// because they get defined with AVX10_1_256 which doesn't support 512 bit.
-// EVEX512 is also unreliable as it can also be defined when 512b is not
-// available.
-// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
-// Use AVX512 macros only whithout AVX10.
-
 /*
-// Test for macros
-#ifdef __AVX10_1__
+// Test for AVX10 macros
+// AVX10-256 was abandoned by Intel before any CPUs were built.
+#ifdef __AVX10__            // does not exist
+#warning "__AVX10__"   
+#endif
+#ifdef __AVX10_1__          // GCC-14
 #warning "__AVX10_1__"
 #endif
-#ifdef __AVX10_1_256__
+#ifdef __AVX10_2__          // GCC-15
+#warning "__AVX10_2__"
+#endif
+#ifdef __AVX10_1_256__      // obsolete
 #warning "__AVX10_1_256__"
 #endif
-#ifdef __AVX10_1_512__
-#warning "__AVX10_1_512__"
+#ifdef __AVX10_1_512__    
+#warning "__AVX10_1_512__"  // does not exist
 #endif
-#ifdef __EVEX256__
-#warning "__EVEX256__"
+#ifdef __EVEX256__          // likely obsolete
+#warning "__EVEX256__"   
 #endif
-#ifdef __EVEX512__
+#ifdef __EVEX512__          // likely obsolete
 #warning "__EVEX512__"
 #endif
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -169,27 +169,14 @@
 #endif
 */

-// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
-// must be tested seperately. 
-// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
-// VBMI: Include AVX512VBMI instructions for supported vector lengths.
-
-#if defined(__AVX10_1__)
-
-  #define VL256 1
-  #define VBMI 1
-  #if defined(__AVX10_1_512__)
-    #define SIMD512 1
-  #endif
-
-#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-  #define VL256 1
+// With Intel abandoning AVX10-256 the SIM512 & VL256 macros are almost
+// identical with the only difference being VBMI is included in VL256.
+#if defined(__AVX10_1__) || ( defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) )
  #define SIMD512 1
-  #if defined(__AVX512VBMI__)
+  #define VL256 1
+  #if defined(__AVX10_1__) || defined(__AVX512VBMI__)
    #define VBMI 1
  #endif
-
 #endif

 /*
@@ -204,13 +191,75 @@
 #endif
 */

+// targetted intrinsics
 #if defined(__x86_64__)
+  #include <x86intrin.h>
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+  #include <arm_neon.h>
+#elif defined(__riscv) && defined(__riscv_vector)
+  #include <riscv_vector.h> 
+#endif

-#include <x86intrin.h>
+// Single global definition for frequently used vector constants.
+// The GCC optimizer can merge constants but merging different vector lengths
+// might be beyond it's scope. 

-#elif defined(__aarch64__)
+// Frequently used SSE/AVX shuffle constants.
+#if defined(SIMD512)

-#include <arm_neon.h>
+// When used with shuffle_epi8 performs are standard bswap of all elements.
+// When used with permutexvar_epi8 (requires AVX512VBMI or AVX10) performs a
+// bswap of the elements in the lower 128 bits of the source and broadcasts
+// the result to all 128 bit lanes of the destination.
+
+extern const __m512i V512_BSWAP64;
+#define V256_BSWAP64 _mm512_castsi512_si256( V512_BSWAP64 )
+#define V128_BSWAP64 _mm512_castsi512_si128( V512_BSWAP64 )
+
+extern const __m512i V512_BSWAP32;
+#define V256_BSWAP32 _mm512_castsi512_si256( V512_BSWAP32 )
+#define V128_BSWAP32 _mm512_castsi512_si128( V512_BSWAP32 )
+
+#elif defined(__AVX2__)
+
+extern const __m256i V256_BSWAP64;
+#define V128_BSWAP64 _mm256_castsi256_si128( V256_BSWAP64 )
+
+extern const __m256i V256_BSWAP32;
+#define V128_BSWAP32 _mm256_castsi256_si128( V256_BSWAP32 )
+
+// These shuffles aren't needed with AVX512, uses ror/rol instead.
+
+extern const __m256i V256_SHUFLR64_8;
+#define V128_SHUFLR64_8 _mm256_castsi256_si128( V256_SHUFLR64_8 )
+
+extern const __m256i V256_SHUFLR64_24;
+#define V128_SHUFLR64_24 _mm256_castsi256_si128( V256_SHUFLR64_24 )
+
+extern const __m256i V256_SHUFLL64_8;
+#define V128_SHUFLL64_8 _mm256_castsi256_si128( V256_SHUFLL64_8 )
+
+extern const __m256i V256_SHUFLL64_24;
+#define V128_SHUFLL64_24 _mm256_castsi256_si128( V256_SHUFLL64_24 )
+
+extern const __m256i V256_SHUFLR32_8;
+#define V128_SHUFLR32_8 _mm256_castsi256_si128( V256_SHUFLR32_8 )
+
+extern const __m256i V256_SHUFLL32_8;
+#define V128_SHUFLL32_8 _mm256_castsi256_si128( V256_SHUFLL32_8 )
+
+#elif defined(__SSSE3__)
+
+extern const __m128i V128_BSWAP64;
+extern const __m128i V128_BSWAP32;
+
+extern const __m128i V128_SHUFLR64_8;
+extern const __m128i V128_SHUFLR64_24;
+extern const __m128i V128_SHUFLL64_8;
+extern const __m128i V128_SHUFLL64_24;
+
+extern const __m128i V128_SHUFLR32_8;
+extern const __m128i V128_SHUFLL32_8;

 #endif

@@ -225,7 +274,7 @@
 // x86_64 AVX512 512 bit vectors
 #include "simd-utils/simd-512.h"

-// aarch64 neon 128 bit vectors
+// aarch64 NEON 128 bit vectors
 #include "simd-utils/simd-neon.h"

 #include "simd-utils/intrlv.h"