v23.6

2025-09-17 23:44:27 +00:00 · 2023-10-28 16:22:14 -04:00
parent 160608cce5
commit 46dca7a493
20 changed files with 3092 additions and 2297 deletions
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -1509,20 +1509,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )

 #elif defined(__ARM_NEON)

-  casti_v128u64( d,0 ) = vdupq_laneq_u64( s0, 0 );
-  casti_v128u64( d,1 ) = vdupq_laneq_u64( s0, 1 );
+  casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
+  casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );

-  casti_v128u64( d,2 ) = vdupq_laneq_u64( s1, 0 );
-  casti_v128u64( d,3 ) = vdupq_laneq_u64( s1, 1 );
+  casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
+  casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );

-  casti_v128u64( d,4 ) = vdupq_laneq_u64( s2, 0 );
-  casti_v128u64( d,5 ) = vdupq_laneq_u64( s2, 1 );
+  casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
+  casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );

-  casti_v128u64( d,6 ) = vdupq_laneq_u64( s3, 0 );
-  casti_v128u64( d,7 ) = vdupq_laneq_u64( s3, 1 );
+  casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
+  casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );

-  casti_v128u64( d,8 ) = vdupq_laneq_u64( s4, 0 );
-  casti_v128u64( d,9 ) = vdupq_laneq_u64( s4, 1 );
+  casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
+  casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );

 #endif
 }
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -907,7 +907,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 #else

 #define v128_blendv( v1, v0, mask ) \
-           v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
+   v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )

 #endif

--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -100,13 +100,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sra32                     vshrq_n_s32
 #define v128_sra16                     vshrq_n_s16

-// logic
+// unary logic
+#define v128_not                       vmvnq_u32
+
+// binary
 #define v128_or                        vorrq_u32
 #define v128_and                       vandq_u32
-#define v128_not                       vmvnq_u32
 #define v128_xor                       veorq_u32
-#define v128_andnot( v1, v0 )          vandq_u32( vmvnq_u32(v1), v0 )
-#define v128_xnor( a, b )              v128_not( v128_xor( a, b ) )
+#define v128_andnot( v1, v0 )          vandq_u32( vmvnq_u32( v1 ), v0 )  // (~v1) & v0: first arg is complemented, as v128_blendv requires
+#define v128_xnor( v1, v0 )            v128_not( v128_xor( v1, v0 ) )
 #define v128_ornot                     vornq_u32 

 // ternary logic, veorq_u32 not defined
@@ -117,7 +119,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_and3( a, b, c )           v128_and( a, v128_and( b, c ) )
 #define v128_or3( a, b, c )            v128_or( a, v128_or( b, c ) )
 #define v128_xorand( a, b, c )         v128_xor( a, v128_and( b, c ) )
-#define v128_andxor( a, b, c )         v128_and( a, v128_xor( b, c ))
+#define v128_andxor( a, b, c )         v128_and( a, v128_xor( b, c ) )
 #define v128_xoror( a, b, c )          v128_xor( a, v128_or( b, c ) )
 #define v128_orand( a, b, c )          v128_or( a, v128_and( b, c ) )

@@ -136,7 +138,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_unpacklo8(  v1, v0 )      vzip1q_u8(  v0, v1 )
 #define v128_unpackhi8(  v1, v0 )      vzip2q_u8(  v0, v1 )

-// Shorter agnostic names for unpack using NEON-like syntax
+// Shorter architecture-agnostic names for unpack using NEON-like mnemonics
 #define v128_ziplo64                   vzip1q_u64
 #define v128_ziphi64                   vzip2q_u64
 #define v128_ziplo32                   vzip1q_u32
@@ -279,28 +281,44 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 //TODO, maybe, Optimize 64 bit rotations
 // Fall back for odd bit rotations
 static inline uint64x2_t v128_ror64( uint64x2_t v, int c )
-{  return vsriq_n_u64( vshlq_n_u64( v, 64-c ), v, c ); }
+{
+   return vsriq_n_u64( vshlq_n_u64( (uint64x2_t)v, 64-c ), (uint64x2_t)v, c );
+}

 static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
-{  return vsriq_n_u64( vshlq_n_u64( v, c ), v, 64-c ); }
+{
+   return vsliq_n_u64( vshrq_n_u64( (uint64x2_t)v, 64-c ), (uint64x2_t)v, c );
+}
+
+//static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
+//{  return vsriq_n_u64( vshlq_n_u64( v, c ), v, 64-c ); }

 static inline uint32x4_t v128_ror32( uint32x4_t v, int c )
 {  return vsriq_n_u32( vshlq_n_u32( v, 32-c ), v, c ); }

 static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
-{  return vsriq_n_u32( vshlq_n_u32( v, c ), v, 32-c ); }
+{  return vsliq_n_u32( vshrq_n_u32( v, 32-c ), v, c ); }
+
+//static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
+//{  return vsriq_n_u32( vshlq_n_u32( v, c ), v, 32-c ); }

 static inline uint16x8_t v128_ror16( uint16x8_t v, int c )
 {  return vsriq_n_u16( vshlq_n_u16( v, 16-c ), v, c ); }

 static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
-{  return vsriq_n_u16( vshlq_n_u16( v, c ), v, 16-c ); }
+{  return vsliq_n_u16( vshrq_n_u16( v, 16-c ), v, c ); }
+
+//static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
+//{  return vsriq_n_u16( vshlq_n_u16( v, c ), v, 16-c ); }

 static inline uint8x16_t v128_ror8( uint8x16_t v, int c )
 {  return vsriq_n_u8( vshlq_n_u8( v, 8-c ), v, c ); }

-static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
-{  return vsriq_n_u8( vshlq_n_u8( v, c ), v, 8-c ); }
+static inline uint8x16_t v128_rol8( uint8x16_t v, int c )
+{  return vsliq_n_u8( vshrq_n_u8( v, 8-c ), v, c ); }
+
+//static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
+//{  return vsriq_n_u8( vshlq_n_u8( v, c ), v, 8-c ); }

 /*
 // Optimzed for half element rotations (swap)
@@ -358,7 +376,7 @@ static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
 }

 // vector rotation , size?
-static inline uint32x4_t v128_swap64( uint32x4_t v )
+static inline uint64x2_t v128_swap64( uint64x2_t v )
 {   return vextq_u64( v, v, 1 ); }

 static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -413,10 +431,10 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 #define v128_bitrev8( v )              vrbitq_u8

 // reverse byte order
-#define v128_bswap16                   vrev16q_u8
-#define v128_bswap32                   vrev32q_u8
-#define v128_bswap64                   vrev64q_u8
-#define v128_bswap128(v)               v128_swap64( v128_bswap64(v) )
+#define v128_bswap16(v)                (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
+#define v128_bswap32(v)                (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
+#define v128_bswap64(v)                (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
+#define v128_bswap128(v)               (uint32x4_t)v128_swap64( v128_bswap64(v) )
 #define v128_bswap256(p)               v128_bswap128( (p)[0], (p)[1] ) 

 // Usefull for x86_64 but does nothing for ARM