Jay D Dee
2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions

View File

@@ -731,6 +731,67 @@ static inline void extr_lane_8x32( void *d, const void *s,
#if defined(__AVX2__)
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 2 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 3 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 6 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 7 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d, 9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,10 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,11 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,12 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,13 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,14 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,15 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,16 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,17 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,18 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,19 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s4 ) );
}
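// For reference, a scalar model of the layout produced above (a sketch for
// illustration only, not used anywhere; intrlv80_8x32_ref is a hypothetical
// name): each of the 20 32 bit words of the 80 byte block is byte swapped
// and written to all 8 interleaved lanes.
static inline void intrlv80_8x32_ref( uint32_t *d, const uint32_t *s )
{
   for ( int i = 0; i < 20; i++ )                    // 80 bytes = 20 words
   {
      const uint32_t w = __builtin_bswap32( s[i] );  // gcc/clang builtin
      for ( int lane = 0; lane < 8; lane++ )         // broadcast to all lanes
         d[ i*8 + lane ] = w;
   }
}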
#else
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
@@ -792,6 +853,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
_mm256_castsi128_si256( s4 ), c3 );
}
#endif // AVX512VBMI else
#endif // AVX2
// 16x32
@@ -1173,10 +1235,12 @@ static inline void extr_lane_16x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
// TODO Enable for AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
@@ -1496,10 +1560,48 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,2 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,3 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,6 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,7 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
}
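// The permute indexes do double duty: within every 64 bit element c0 selects
// source bytes 3,2,1,0,7,6,5,4 (the first qword with each 32 bit half byte
// swapped) and c1 selects bytes 11,10,9,8,15,14,13,12 (the second qword
// likewise), so a single vpermb both byte swaps and broadcasts to all 4 lanes.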
#else
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m128i s4 = casti_m128i( src,4 );
@@ -1524,6 +1626,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#endif
#endif // AVX2
// 8x64 (AVX512)
@@ -1846,6 +1950,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
#if defined(__AVX512F__) && defined(__AVX512VL__)
//TODO Enable for AVX10_512
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
@@ -2089,10 +2195,36 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
//TODO Enable for AVX10_512
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s0 ) );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s1 ) );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s2 ) );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s3 ) );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( bswap_shuf,
_mm512_castsi128_si512( s4 ) );
}
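// The source is zero extended to 512 bits and every index byte in bswap_shuf
// is in the range 0..15, so the single permute above both byte swaps the
// 32 bit words and replicates the 128 bit block to all 4 lanes.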
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
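// The control above reverses the bytes within each 32 bit word: destination
// bytes 0..3 take source bytes 3,2,1,0, and likewise for the other words.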
@@ -2108,14 +2240,15 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
}
#endif // AVX512VBMI ELSE
#endif // AVX512
// 2x256 (AVX512)
@@ -2955,6 +3088,8 @@ do { \
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//TODO Enable for AVX10_512
/*
#define mm512_intrlv_blend_128( hi, lo ) \
_mm512_mask_blend_epi32( 0x0f0f, hi, lo )

View File

@@ -43,9 +43,11 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -73,15 +75,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Deprecated, use set1 directly
#define m128_const1_64 _mm_set1_epi64x
#define m128_const1_32 _mm_set1_epi32
// Deprecated, use set directly
#define m128_const_64 _mm_set_epi64x
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
//#define m128_one_64 _mm_set1_epi64x( 1 )
@@ -141,7 +135,7 @@ static inline __m128i mm128_neg1_fn()
// Examples of simple operations using xim:
// Copy i to element c of dest and copy remaining elements from v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -161,6 +155,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Bitwise not (~v)
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -223,18 +218,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
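// The immediate is just the truth table of the expression: bit i gives the
// result for inputs a = bit 2 of i, b = bit 1, c = bit 0. A sketch of how
// the constants above can be derived (ternlog_imm8 is a hypothetical helper,
// not used by this code):
//
//   static inline uint8_t ternlog_imm8( int (*f)( int, int, int ) )
//   {
//      uint8_t imm = 0;
//      for ( int i = 0; i < 8; i++ )
//         imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
//      return imm;
//   }
//
// For f = a ^ b ^ c this yields 0x96, for a & b & c 0x80, and for
// a ^ ( b & c ) 0x78, matching the constants used here.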
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#endif
@@ -257,6 +288,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -372,7 +404,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
@@ -380,7 +415,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
@@ -390,7 +427,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 32 bit lanes
#if defined(__AVX512VL__)
#define mm128_swap32_16( v ) _mm_ror_epi32( v, 16 )
#elif defined(__SSSE3__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
@@ -400,7 +439,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
#elif defined(__SSSE3__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
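// The control selects source bytes 1,2,3,0 within every 32 bit lane, i.e. a
// rotate right by 8 bits of each lane, matching _mm_ror_epi32( v, 8 ) above.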

View File

@@ -13,17 +13,14 @@
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// AVX10_256 will support AVX512VL instructions on CPUs limited to 256 bit
// vectors. This will require enabling when the compiler's AVX10 feature
// macros are known.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Instructions that can move data across the 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
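// For example (a sketch assuming AVX512VL & AVX512VBMI), reversing all 32
// bytes of a 256 bit vector needs a cross lane permute such as
// _mm256_permutexvar_epi8, while _mm256_shuffle_epi8 alone can only reverse
// the bytes within each 128 bit half.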
#if defined(__AVX__)
@@ -66,6 +63,7 @@ typedef union
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_bcast128lo_64( i64 ) _mm256_maskz_set1_epi64( 0x55, i64 )
#define mm256_bcast128hi_64( i64 ) _mm256_maskz_set1_epi64( 0xaa, i64 )
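// Only the low 4 mask bits matter for 4 64 bit elements: 0x55 (0101) keeps
// the low element of each 128 bit lane and zeroes the high one, 0xaa (1010)
// does the opposite.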
@@ -81,11 +79,9 @@ typedef union
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated
#define m256_const1_64 _mm256_set1_epi64x
#define m256_const1_32 _mm256_set1_epi32
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
//
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
@@ -121,6 +117,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Basic operations without SIMD equivalent
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -140,8 +137,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
#if defined(__AVX512VL__)
// AVX512 has ternary logic that supports any 3 input boolean expression.
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -176,31 +172,31 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
@@ -226,6 +222,7 @@ static inline __m256i mm256_not( const __m256i v )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -380,6 +377,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else

View File

@@ -113,10 +113,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated, use set
#define m512_const1_64 _mm512_set1_epi64
#define m512_const1_32 _mm512_set1_epi32
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// Deprecated