Jay D Dee
2024-12-08 11:14:08 -05:00
parent 8e91bfbe19
commit 06624a0ff2
18 changed files with 1526 additions and 1140 deletions

simd-utils/simd-neon.h

@@ -93,6 +93,8 @@
 #define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
 #define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )
+#define v128_cmpeq_zero vceqzq_u64
 // Logical bit shift
 #define v128_sl64 vshlq_n_u64
 #define v128_sl32 vshlq_n_u32
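
The new v128_cmpeq_zero maps to vceqzq_u64, which compares each 64-bit lane against zero and returns all-ones in lanes that are zero, avoiding the explicit zero operand that vceqq_u64 would need. A minimal sketch of the kind of test it enables (the helper name is hypothetical, not part of this commit):

   #include <arm_neon.h>
   #include <stdbool.h>

   // Hypothetical helper: true when both 64 bit lanes of v are zero.
   static inline bool v128_is_zero( uint64x2_t v )
   {
      uint64x2_t eq = vceqzq_u64( v );   // lane == 0 ? all-ones : 0
      return ( vgetq_lane_u64( eq, 0 ) & vgetq_lane_u64( eq, 1 ) )
             == 0xffffffffffffffffull;
   }
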
@@ -135,14 +137,14 @@
 #if defined(__ARM_FEATURE_SHA3)
 #define v128_xor3 veor3q_u32
 #else
-#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
+#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
 #endif
 // v2 & v1 & v0
-#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
+#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )
 // v2 | v1 | v0
-#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
+#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )
 // v2 ^ ( ~v1 & v0 )
 #if defined(__ARM_FEATURE_SHA3)
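
The three-operand helpers above are reassociated from a right fold, v2 ^ ( v1 ^ v0 ), to a left fold, ( v2 ^ v1 ) ^ v0. XOR, AND and OR are associative, so results are unchanged; presumably the left fold mirrors the operand order of the single-instruction veor3q_u32 path taken when __ARM_FEATURE_SHA3 is defined. A quick equivalence check, assuming only <arm_neon.h>:

   #include <arm_neon.h>
   #include <assert.h>

   int main( void )
   {
      uint32x4_t a = vmovq_n_u32( 0xdeadbeef );
      uint32x4_t b = vmovq_n_u32( 0x01234567 );
      uint32x4_t c = vmovq_n_u32( 0x89abcdef );
      uint32x4_t right = veorq_u32( a, veorq_u32( b, c ) );  // old grouping
      uint32x4_t left  = veorq_u32( veorq_u32( a, b ), c );  // new grouping
      // associativity guarantees every lane matches
      assert( vgetq_lane_u32( right, 0 ) == vgetq_lane_u32( left, 0 ) );
      return 0;
   }
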
@@ -178,6 +180,7 @@
 #define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
 #define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
+// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
 // AES
 // consistent with Intel AES intrinsics, break up for optimizing
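
The added comment notes that vzipq_u32 produces both interleavings at once. A sketch of what that means, assuming only <arm_neon.h> (the function name is illustrative):

   #include <arm_neon.h>

   // vzipq_u32 returns both halves of the interleave in one call:
   // .val[0] is the low interleave (vzip1q_u32), .val[1] the high
   // (vzip2q_u32). There is no vzipq_u64 counterpart.
   static inline uint32x4x2_t unpack_both_sketch( uint32x4_t v1,
                                                  uint32x4_t v0 )
   {
      return vzipq_u32( v1, v0 );
   }
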
@@ -237,18 +240,15 @@ typedef union
 #define cast_v128u32( p ) (*((uint32x4_t*)(p)))
 #define castp_v128u32( p ) ((uint32x4_t*)(p))
-#define v128_zero v128_64( 0ull )
-#define v128_cmpeq_zero vceqzq_u64
-#define v128_neg1 v128_64( 0xffffffffffffffffull )
 // set1
 #define v128_64 vmovq_n_u64
 #define v128_32 vmovq_n_u32
 #define v128_16 vmovq_n_u16
 #define v128_8 vmovq_n_u8
+#define v128_zero v128_64( 0ull )
+#define v128_neg1 v128_64( 0xffffffffffffffffull )
 #define v64_set32( u32_1, u32_0 ) \
    vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 ((uint16x8_t)(v)), c )
 #define v128_rol16( v, c ) \
-( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
+( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
 : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
 ((uint16x8_t)(v)), c )
 #define v128_ror8( v, c ) \
-vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
 ((uint8x16_t)(v)), c )
 #define v128_rol8( v, c ) \
-vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
 ((uint8x16_t)(v)), c )
-// ( v1 ^ v0 ) >>> n
+// ( v1 ^ v0 ) >>> c
 #if defined(__ARM_FEATURE_SHA3)
-#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
+#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
 #else
-#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
+#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
 #endif
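
Besides adding parentheses around v inside the rol16 cast, this hunk renames the rotate count from n to c to match the other rotate macros. The idiom itself is unchanged: AArch64 NEON has no general vector-rotate instruction, so a rotate is synthesized from two shifts fused by SRI/SLI (shift right/left and insert), and the shift counts must be compile-time immediates, which is why these are macros. With the SHA3 extension, ( v1 ^ v0 ) >>> c on 64-bit lanes collapses to a single XAR instruction via vxarq_u64. A compilable sketch with fixed counts (function names are illustrative only):

   #include <arm_neon.h>

   // Rotate each byte right by 3, the same shape as v128_ror8:
   // v << 5 carries the low 3 bits of each lane to the top, then VSRI
   // shifts v right by 3 and inserts it underneath -- a rotate right.
   static inline uint8x16_t ror8_by3( uint8x16_t v )
   {
      return vsriq_n_u8( vshlq_n_u8( v, 5 ), v, 3 );
   }

   #if defined(__ARM_FEATURE_SHA3)
   // ( v1 ^ v0 ) >>> 8 in a single XAR instruction.
   static inline uint64x2_t ror64xor_by8( uint64x2_t v1, uint64x2_t v0 )
   {
      return vxarq_u64( v1, v0, 8 );
   }
   #endif
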
 #define v128_2ror64( v1, v0, c ) \
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }
-#define v128_2rorx32( v1, v0, c ) \
+#define v128_2ror32( v1, v0, c ) \
 { \
 uint32x4_t t0 = vshlq_n_u32( v0, c ); \
 uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_lrev16 vrev32q_u16
 // aka bswap
-#define v128_qrev8 vrev64q_u8
-#define v128_lrev8 vrev32q_u8
-#define v128_wrev8 vrev16q_u8
+// #define v128_qrev8 vrev64q_u8
+// #define v128_lrev8 vrev32q_u8
+// #define v128_wrev8 vrev16q_u8
 // full vector rotation
@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 #define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
 #define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
 #define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
-#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
+#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
-// Usefull for x86_64 but does nothing for ARM
+// Useful for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
 { \
 casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
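
v128_bswap128 now calls v128_rev64 (presumably the renamed 64-bit-lane swap formerly spelled v128_swap64) on the result of v128_bswap64. The factoring is: a full 128-bit byte swap equals a byte reverse within each 64-bit lane followed by exchanging the two lanes. A self-contained sketch, using vextq_u8 for the lane exchange rather than the header's own helper:

   #include <arm_neon.h>

   // Reverse all 16 bytes: vrev64q_u8 reverses bytes within each
   // 64 bit lane, then vextq_u8 with offset 8 swaps the two halves
   // when both source operands are the same vector.
   static inline uint32x4_t bswap128_sketch( uint32x4_t v )
   {
      uint8x16_t b = vrev64q_u8( (uint8x16_t)v );
      return (uint32x4_t)vextq_u8( b, b, 8 );
   }
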
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 // Bitwise blend using vector mask, use only bytewise for compatibility
 // with x86_64.
-#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
+#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )
 #endif // __ARM_NEON
 #endif // SIMD_NEON_H__
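
The v128_blendv fix swaps the selected operands of vbslq_u32. BSL computes ( mask & a ) | ( ~mask & b ), taking bits from its second argument where the mask is 1, so vbslq_u32( mask, v0, v1 ) returns v0 where the mask is set and v1 elsewhere. That matches the x86_64 convention of _mm_blendv_epi8( v1, v0, mask ), which the comment above says this wrapper must follow (hence the advice to use it only bytewise, since blendv selects per byte while BSL selects per bit). A sketch of the corrected semantics:

   #include <arm_neon.h>

   // mask bit 1 selects v0, mask bit 0 selects v1 -- x86_64 blendv order.
   static inline uint32x4_t blendv_sketch( uint32x4_t v1, uint32x4_t v0,
                                           uint32x4_t mask )
   {
      return vbslq_u32( mask, v0, v1 );
   }
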