Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: v24.2
@@ -469,7 +469,7 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
 #if defined(__SSSE3__)
 
   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                            0x0405060700010203 );
+                                            0x0405060700010203 );
 
   s0 = _mm_shuffle_epi8( s0, bswap_shuf );
  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
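Aside (not part of the commit): the bswap_shuf constant above is a pshufb control that reverses the four bytes inside each 32-bit lane, i.e. a per-lane byte swap. A minimal standalone check, assuming SSSE3 is available:

// Illustrative only: verify that the shuffle control byte-swaps each 32-bit lane.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   __m128i v = _mm_set_epi32( 0x44434241, 0x34333231, 0x24232221, 0x14131211 );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, bswap_shuf ) );
   printf( "%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3] );
   // expected: 11121314 21222324 31323334 41424344
   return 0;
}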
@@ -913,9 +913,7 @@ static inline void extr_lane_8x32( void *d, const void *s,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
-
-//TODO Enable for AVX10_256 AVX10_512
+#if defined(VL256) && defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
@@ -977,7 +975,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 {
   const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                            0x0405060700010203 );
+                                            0x0405060700010203 );
   const __m256i c1 = v256_32( 1 );
   const __m256i c2 = _mm256_add_epi32( c1, c1 );
   const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1035,7 +1033,8 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
                                  _mm256_castsi128_si256( s4 ), c3 );
 }
 
 #endif  // AVX512VBMI else
+#endif
 
 #endif  // AVX2
 
 // 16x32
@@ -1417,11 +1416,9 @@ static inline void extr_lane_16x32( void *d, const void *s,
    ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
 }
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
-#if defined(__AVX512VBMI__)
-
-// TODO Enable for AVX10_512
+#if defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
@@ -1540,7 +1537,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
                        _mm512_castsi128_si512( s4 ) );
 }
 
 #endif  // VBMI else
 #endif
 #endif  // AVX512
 
 ///////////////////////////
@@ -1983,9 +1980,9 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
 
 #endif
 
-#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
+#if defined(__AVX2__)
 
-//TODO Enable for AVX10_256 AVX10_512
+#if defined(VL256) && defined(VBMI)
 
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -2019,7 +2016,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                         _mm256_castsi128_si256( s4 ) );
 }
 
-#elif defined(__AVX2__)
+#else
 
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -2049,6 +2046,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                         _mm256_castsi128_si256( s4 ), 0x55 );
 }
 
 #endif
 
+#endif  // AVX2
+
 #endif  // SSE2
@@ -2375,9 +2374,7 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
 
 #endif // SSE2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__)
-
-//TODO Enable for AVX10_512
+#if defined(SIMD512)
 
 // broadcast to all lanes
 static inline void mm512_intrlv80_8x64( void *dst, const void *src )
@@ -2399,7 +2396,7 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
 
 // byte swap and broadcast to all lanes
 
-#if defined(__AVX512VBMI__)
+#if defined(VBMI)
 
 // Combine byte swap & broadcast in one permute
 static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
@@ -2626,10 +2623,9 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
 
 #endif // SSE2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
-#if defined(__AVX512VBMI__)
-//TODO Enable for AVX10_512
+#if defined(VBMI)
 
 static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
 {
@@ -3532,9 +3528,7 @@ do { \
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-//TODO Enable for AVX10_512
+#if defined(SIMD512)
 
 /*
 #define mm512_intrlv_blend_128( hi, lo ) \
@@ -3559,7 +3553,7 @@ do { \
    dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \
 } while(0)
 
-#endif // AVX512
+#endif // SIMD512
 
 #undef ILEAVE_4x32
 #undef LOAD_SRCE_4x32
@@ -207,12 +207,12 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
 
 #endif
 
-// broadcast (replicate) lane l to all lanes
-#define v128_replane64( v, l ) \
+// Broadcast lane l to all lanes
+#define v128_duplane64( v, l ) \
    ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
                 : _mm_shuffle_epi32( v, 0xee )
 
-#define v128_replane32( v, l ) \
+#define v128_duplane32( v, l ) \
    ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
  : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
  : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
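Aside (not part of the commit): the duplane macros expand to _mm_shuffle_epi32, whose immediate is four 2-bit lane selectors; 0xaa repeats lane 2 in every position. A minimal check, assuming SSE2 only:

// Illustrative only: 0xaa = 0b10'10'10'10 selects lane 2 for all four outputs,
// so v128_duplane32( v, 2 ) broadcasts element 2 across the vector.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m128i v = _mm_set_epi32( 40, 30, 20, 10 );   // lanes 3,2,1,0
   __m128i b = _mm_shuffle_epi32( v, 0xaa );      // same effect as v128_duplane32( v, 2 )
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, b );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );   // 30 30 30 30
   return 0;
}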
@@ -347,8 +347,7 @@ static inline __m128i v128_neg1_fn()
 // Basic operations without equivalent SIMD intrinsic
 
 // Bitwise not (~v)
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 static inline __m128i v128_not( const __m128i v )
 {  return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -402,8 +401,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
 #define memcpy_128 v128_memcpy
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
+// ~v1 | v0
+#define v128_ornot( v1, v0 )   _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define v128_xor3( a, b, c )   _mm_ternarylogic_epi64( a, b, c, 0x96 )
@@ -434,6 +435,8 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 
 #else
 
+#define v128_ornot( v1, v0 )   _mm_or_si128( v1, v128_not( v0 ) )
+
 #define v128_xor3( a, b, c )   _mm_xor_si128( a, _mm_xor_si128( b, c ) )
 
 #define v128_and3( a, b, c )   _mm_and_si128( a, _mm_and_si128( b, c ) )
@@ -454,7 +457,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 
 #endif
 
-#define v128_ornot( a, b )  _mm_or_si128( a, v128_not( b ) )
 
 // Mask making
 // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
@@ -494,7 +496,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_rol32_sse2( v, c ) \
    _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 // AVX512 fastest for all rotations.
 #define v128_ror64    _mm_ror_epi64
@@ -609,13 +611,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 // deprecated
 #define mm128_rol_32    v128_rol32
 
+// ror( v1 ^ v0, n )
+#define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
+
 /* not used
 // x2 rotates elements in 2 individual vectors in a double buffered
 // optimization for SSE2, does nothing for AVX512 but is there for
 // transparency.
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 #define v128_2ror64( v1, v0, c ) \
    _mm_ror_epi64( v0, c ); \
@@ -917,10 +921,8 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 #define v128_block_bswap32    mm128_block_bswap_32
 #define v128_block_bswap64    mm128_block_bswap_64
 
 
-// alignr instruction for 32 & 64 bit elements is only available with AVX512
-// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
 
 #if defined(__SSSE3__)
 
 #define v128_alignr8    _mm_alignr_epi8
@@ -929,6 +931,9 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 
 #else
 
+#define v128_alignr8( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
+
 #define v128_alignr64( hi, lo, c ) \
    _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
 
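Aside (not part of the commit): Intel's alignr semantics, which the SSSE3 path maps to directly, can be modelled on bytes as follows for 0 <= c <= 16. This is an illustrative sketch only; alignr8_ref is a hypothetical helper name, not something from the repo.

// Scalar reference for _mm_alignr_epi8( hi, lo, c ): concatenate hi:lo,
// shift right by c bytes, keep the low 16 bytes.
#include <stdint.h>
#include <string.h>

static void alignr8_ref( uint8_t r[16], const uint8_t hi[16],
                         const uint8_t lo[16], int c )
{
   uint8_t cat[32];
   memcpy( cat,      lo, 16 );   // lo occupies the low half
   memcpy( cat + 16, hi, 16 );   // hi occupies the high half
   memcpy( r, cat + c, 16 );     // bytes c .. c+15 of the concatenation
}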
@@ -937,12 +942,15 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 
 #endif
 
+// blend using vector mask
 #if defined(__SSE4_1__)
 
+// Bytewise using sign bit of each byte element of mask
 #define v128_blendv    _mm_blendv_epi8
 
 #else
 
+// Bitwise
 #define v128_blendv( v1, v0, mask ) \
    v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
 
@@ -66,8 +66,7 @@ typedef union
 
 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 #define mm256_bcast128lo_64( i64 )   _mm256_maskz_set1_epi64( 0x55, i64 )
 #define mm256_bcast128hi_64( i64 )   _mm256_maskz_set1_epi64( 0xaa, i64 )
@@ -117,8 +116,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Basic operations without SIMD equivalent
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
 static inline __m256i mm256_not( const __m256i v )
 {  return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -137,8 +135,10 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_add4_32( a, b, c, d ) \
    _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 
+// ~v1 | v0
+#define mm256_ornot( v1, v0 )   _mm256_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define mm256_xor3( a, b, c )   _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -172,6 +172,8 @@ static inline __m256i mm256_not( const __m256i v )
 
 #else
 
+#define mm256_ornot( v1, v0 )   _mm256_or_si256( v1, mm256_not( v0 ) )
+
 #define mm256_xor3( a, b, c ) \
    _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
 
@@ -257,7 +259,7 @@ static inline __m256i mm256_not( const __m256i v )
    _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                     _mm256_srli_epi32( v, 32-(c) ) )
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -343,8 +345,7 @@ static inline __m256i mm256_not( const __m256i v )
 // optimization for AVX2, does nothing for AVX512 but is here for
 // transparency.
 
-#if defined(__AVX512VL__)
-//TODO Enable for AVX10_256
+#if defined(VL256)
 /*
 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -470,7 +471,7 @@ static inline __m256i mm256_not( const __m256i v )
 
 /* Not used
 // Rotate 256 bit vector by one 32 bit element.
-#if defined(__AVX512VL__)
+#if defined(VL256)
 static inline __m256i mm256_shuflr_32( const __m256i v )
 {  return _mm256_alignr_epi32( v, v, 1 ); }
 static inline __m256i mm256_shufll_32( const __m256i v )
@@ -507,8 +508,8 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #define mm256_shuflr128_32(v)   _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_shufll128_32(v)   _mm256_shuffle_epi32( v, 0x93 )
 
-#define mm256_shuflr128_16(v)   _mm256_shuffle_epi16( v, 0x39 )
-#define mm256_shufll128_16(v)   _mm256_shuffle_epi16( v, 0x93 )
+#define mm256_shuflr128_16(v)   mm256_shuffle_16( v, 0x39 )
+#define mm256_shufll128_16(v)   mm256_shuffle_16( v, 0x93 )
 
 /* Not used
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
@@ -606,6 +607,22 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
    casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
 }
 
+#if defined(VL256)
+
+#define mm256_alignr64   _mm256_alignr_epi64
+
+#else
+
+#define mm256_alignr64( v1, v0, c ) \
+   ( ( (c) & 3 ) == 1 ) ? _mm256_blend_epi32( mm256_shuflr_64( v1 ), \
+                                              mm256_shuflr_64( v0 ), 0x3f ) \
+ : ( ( (c) & 3 ) == 2 ) ? _mm256_blend_epi32( mm256_rev_128( v1 ), \
+                                              mm256_rev_128( v0 ), 0x0f ) \
+ : ( ( (c) & 3 ) == 3 ) ? _mm256_blend_epi32( mm256_shufll_64( v1 ), \
+                                              mm256_shufll_64( v0 ), 0x03 ) \
+ : v0
+
+#endif
+
 #endif // __AVX2__
 
 #endif // SIMD_256_H__
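Aside (not part of the commit): the valignq semantics that mm256_alignr64 targets treat v1:v0 as eight 64-bit lanes with v0 in the low half, shift right by c lanes, and keep the low four; the blend-of-rotated-vectors fallback above reproduces that for c = 1..3. A scalar reference sketch (alignr64x4_ref is a hypothetical name, not from the repo):

// Scalar reference for the 256-bit alignr on 64-bit lanes.
#include <stdint.h>
#include <string.h>

static void alignr64x4_ref( uint64_t r[4], const uint64_t v1[4],
                            const uint64_t v0[4], int c )
{
   uint64_t cat[8];
   memcpy( cat,     v0, 4 * sizeof(uint64_t) );   // v0 in the low half
   memcpy( cat + 4, v1, 4 * sizeof(uint64_t) );   // v1 in the high half
   memcpy( r, cat + ( c & 3 ), 4 * sizeof(uint64_t) );
}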
@@ -14,7 +14,13 @@
 // vectors. It is therefore not technically required for any 512 bit vector
 // utilities defined below.
 
-#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// if avx10              // avx512 is always set
+//    if evex512:           yes
+// else if avx512 :         yes    // avx512 is set but not avx10
+// else :                   no     // avx512 not set or avx10.1 is set without evex512
+
+
+#if defined(SIMD512)
 
 // AVX512 intrinsics have a few changes from previous conventions.
 //
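Aside (not part of the commit): the SIMD512, VL256 and VBMI gates used throughout this diff are defined elsewhere in the tree. The sketch below is purely hypothetical, showing only how such gates could be derived from compiler feature macros; it ignores the AVX10/EVEX512 cases described in the comment above and is not the project's actual definition.

// Hypothetical feature gates; the real definitions also account for AVX10.
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1    // full 512-bit AVX512 baseline usable
  #define VL256   1    // AVX512 instruction forms usable on 256/128-bit vectors
#endif
#if defined(__AVX512VBMI__)
  #define VBMI    1    // byte-granularity permutes (vpermb etc.)
#endif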
@@ -180,6 +186,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // Ternary logic uses 8 bit truth table to define any 3 input logical
 // expression using any number or combinations of AND, OR, XOR, NOT.
 
+// ~v1 | v0
+#define mm512_ornot( v1, v0 )   _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
+
 // a ^ b ^ c
 #define mm512_xor3( a, b, c )   _mm512_ternarylogic_epi64( a, b, c, 0x96 )
 
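Aside (not part of the commit): the 8-bit immediate is a truth table indexed per bit by (a<<2)|(b<<1)|c, which is how 0x96 yields a ^ b ^ c and 0xcf (with the second operand passed twice) yields ~a | b, and how immediate 1 with the same operand three times yields ~a. A scalar model and quick check:

// Scalar model of the 3-input truth-table select used by ternarylogic.
#include <stdint.h>
#include <stdio.h>

static uint64_t ternlog64( uint64_t a, uint64_t b, uint64_t c, uint8_t imm )
{
   uint64_t r = 0;
   for ( int i = 0; i < 64; i++ )
   {
      int idx = ( ( (a >> i) & 1 ) << 2 ) | ( ( (b >> i) & 1 ) << 1 )
              |   ( (c >> i) & 1 );
      r |= (uint64_t)( ( imm >> idx ) & 1 ) << i;
   }
   return r;
}

int main(void)
{
   uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL,
            c = 0x0f0f0f0f0f0f0f0fULL;
   printf( "%d\n", ternlog64( a, b, c, 0x96 ) == ( a ^ b ^ c ) );   // xor3
   printf( "%d\n", ternlog64( a, b, b, 0xcf ) == ( ~a | b ) );      // ornot
   printf( "%d\n", ternlog64( a, a, a, 0x01 ) == ~a );              // not
   return 0;
}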
@@ -4,22 +4,20 @@
 #if defined(__aarch64__) && defined(__ARM_NEON)
 
 // Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
-// Size matters!
+// Element size matters!
 //
 // Intel naming is generally used.
 //
-// documented instructions that aren't defined on RPi 4.
-// They seem to be all 3 op instructionsi.
+// Some advanced logical operations that require SHA3. Prior to GCC-13
+// they also require armv8.2
 //
-// veor3q                    ie xor3
-// vxarq_u64( v1, v0, n )    ror( xor( v1, v0 ), n )
-// vraxlq_u64( v1, v0 )      xor( rol( v1, 1 ), rol( v0, 1 ) )
-// vbcaxq( v2, v1, v0 )      xor( v2, and( v1, not(v0) ) )
-// vsraq_n( v1, v0, n )      add( v1, sr( v0, n ) )
+// veor3q( v2, v1, v0 )                 xor3        v2 ^ v1 ^ v0
+// vxarq_u64( v1, v0, n )               ror64xor    ( v1 ^ v0 ) >>> n
+// vbcaxq_u{64,32,16,8}( v2, v1, v0 )   xorandnot   v2 ^ ( v1 & ~v0 )
 //
-// Doesn't work on RPi but works on OPi:
-//
-// vornq( v1, v0 )           or( v1, not( v0 ) )
+// not used anywhere yet
+// vrax1q_u64( v1, v0 )                 v1 ^ ( v0 <<< 1 )
+// vsraq_n_u{64,32,16,8}( v1, v0, n )   v1 + ( v0 >> n )
 
 #define v128_t        uint32x4_t   // default,
 #define v128u64_t     uint64x2_t
@@ -87,15 +85,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // Not yet needed
 //#define v128_cmpeq1
 // Signed
-#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
-#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
-#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
-#define v128_cmpgt8(  v1, v0 )   vcgtq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
+#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
+#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
+#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
+#define v128_cmpgt8(  v1, v0 )   vcgtq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )
 
-#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
-#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
-#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
-#define v128_cmplt8(  v1, v0 )   vcltq_s8(  (int8x16_t)v1, (int8x16_t)v0 )
+#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
+#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
+#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
+#define v128_cmplt8(  v1, v0 )   vcltq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )
 
 // Logical bit shift
 #define v128_sl64    vshlq_n_u64
@@ -109,33 +107,38 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sr8     vshrq_n_u8
 
 // Arithmetic shift.
-#define v128_sra64( v, c )   vshrq_n_s64( (int64x2_t)v, c )
-#define v128_sra32( v, c )   vshrq_n_s32( (int32x4_t)v, c )
-#define v128_sra16( v, c )   vshrq_n_s16( (int16x8_t)v, c )
+#define v128_sra64( v, c )   vshrq_n_s64( (int64x2_t)(v), c )
+#define v128_sra32( v, c )   vshrq_n_s32( (int32x4_t)(v), c )
+#define v128_sra16( v, c )   vshrq_n_s16( (int16x8_t)(v), c )
 
 // unary logic
 #define v128_not     vmvnq_u32
 
 // binary logic
 #define v128_or      vorrq_u32
 #define v128_and     vandq_u32
 #define v128_xor     veorq_u32
 
 // ~v1 & v0
-#define v128_andnot( v1, v0 )   vandq_u32( vmvnq_u32( v1 ), v0 )
+#define v128_andnot( v1, v0 )   vbicq_u32( v0, v1 )
 
 // ~( a ^ b ), same as (~a) ^ b
 #define v128_xnor( v1, v0 )     v128_not( v128_xor( v1, v0 ) )
 
-// ~v1 | v0, x86_64 convention, first arg is not'ed
-#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
+// ~v1 | v0, args reversed for consistency with x86_64
+#define v128_ornot( v1, v0 )    vornq_u32( v0, v1 )
 
 // ternary logic
 
 // v2 ^ v1 ^ v0
-// veorq_u32 not defined
-//#define v128_xor3    veor3q_u32
-#define v128_xor3( v2, v1, v0 )   veorq_u32( v2, veorq_u32( v1, v0 ) )
+// This will compile with GCC-11 on armv8.2 and above. At this time there is no
+// known way to test arm minor version.
+#if defined(__ARM_FEATURE_SHA3)
+#define v128_xor3    veor3q_u32
+#else
+#define v128_xor3( v2, v1, v0 )   veorq_u32( v2, veorq_u32( v1, v0 ) )
+#endif
 
 // v2 & v1 & v0
 #define v128_and3( v2, v1, v0 )   v128_and( v2, v128_and( v1, v0 ) )
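Aside (not part of the commit): vbicq_u32(x, y) computes x & ~y and vornq_u32(x, y) computes x | ~y, so the reversed-argument wrappers above give ~v1 & v0 and ~v1 | v0 respectively. A small aarch64-only check:

// Illustrative only; compiles on aarch64 with NEON.
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
   uint32x4_t v1 = vdupq_n_u32( 0xf0f0f0f0 );
   uint32x4_t v0 = vdupq_n_u32( 0x0000ffff );
   // vbicq_u32( a, b ) = a & ~b  ->  ~v1 & v0 when called as ( v0, v1 )
   uint32x4_t an = vbicq_u32( v0, v1 );
   // vornq_u32( a, b ) = a | ~b  ->  ~v1 | v0 when called as ( v0, v1 )
   uint32x4_t on = vornq_u32( v0, v1 );
   printf( "%08x %08x\n", vgetq_lane_u32( an, 0 ), vgetq_lane_u32( on, 0 ) );
   // expected: 00000f0f 0f0fffff
   return 0;
}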
@@ -143,8 +146,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // v2 | v1 | v0
 #define v128_or3( v2, v1, v0 )    v128_or( v2, v128_or( v1, v0 ) )
 
-// a ^ ( ~b & c )
-#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
+// v2 ^ ( ~v1 & v0 )
+#if defined(__ARM_FEATURE_SHA3)
+#define v128_xorandnot( v2, v1, v0 )   vbcaxq_u32( v2, v0, v1 )
+#else
+#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
+#endif
 
 // a ^ ( b & c )
 #define v128_xorand( v2, v1, v0 )   v128_xor( v2, v128_and( v1, v0 ) )
@@ -158,12 +165,12 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // v2 | ( v1 & v0 )
 #define v128_orand( v2, v1, v0 )   v128_or( v2, v128_and( v1, v0 ) )
 
-// shift 2 concatenated vectors right.
+// shift 2 concatenated vectors right, args reversed for consistency with x86_64
 #define v128_alignr64( v1, v0, c )   vextq_u64( v0, v1, c )
 #define v128_alignr32( v1, v0, c )   vextq_u32( v0, v1, c )
 #define v128_alignr8(  v1, v0, c )   vextq_u8(  v0, v1, c )
 
-// Intetleave high or low half of 2 vectors.
+// Interleave high or low half of 2 vectors.
 #define v128_unpacklo64( v1, v0 )   vzip1q_u64( v1, v0 )
 #define v128_unpackhi64( v1, v0 )   vzip2q_u64( v1, v0 )
 #define v128_unpacklo32( v1, v0 )   vzip1q_u32( v1, v0 )
@@ -214,10 +221,10 @@ typedef union
 #define v128_bcast32(v)   vdupq_laneq_u32( v, 0 )
 #define v128_bcast16(v)   vdupq_laneq_u16( v, 0 )
 
-// Replicate (broadcast) lane l to all lanes
-#define v128_replane64( v, l )   vdupq_laneq_u64( v, l )
-#define v128_replane32( v, l )   vdupq_laneq_u32( v, l )
-#define v128_replane16( v, l )   vdupq_laneq_u16( v, l )
+// Broadcast lane l to all lanes
+#define v128_duplane64( v, l )   vdupq_laneq_u64( v, l )
+#define v128_duplane32( v, l )   vdupq_laneq_u32( v, l )
+#define v128_duplane16( v, l )   vdupq_laneq_u16( v, l )
 
 // pointer indexing
 #define casti_v128( p, i )   (((uint32x4_t*)(p))[i])
@@ -232,16 +239,6 @@ typedef union
 #define cast_v128u32( p )    (*((uint32x4_t*)(p)))
 #define castp_v128u32( p )   ((uint32x4_t*)(p))
 
-// use C cast, flexible source type
-#define u32_to_u64   vreinterpretq_u64_u32
-#define u64_to_u32   vreinterpretq_u32_u64
-
-#define u64_to_u8    vreinterpretq_u8_u64
-#define u8_to_u64    vreinterpretq_u64_u8
-
-#define u32_to_u8    vreinterpretq_u8_u32
-#define u8_to_u32    vreinterpretq_u32_u8
-
 #define v128_zero    v128_64( 0ull )
 
 #define v128_cmpeq_zero    vceqzq_u64
@@ -336,35 +333,56 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_movmask64
 
 // Bit rotation
 
 #define v128_ror64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
-   : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
+               : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
+                              ((uint64x2_t)(v)), c )
 
 #define v128_rol64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
-   : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
+   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
+               : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
+                              ((uint64x2_t)(v)), c )
 
 #define v128_ror32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
-   : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
+               : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
+                              ((uint32x4_t)(v)), c )
 
 #define v128_rol32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
-   : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
+   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
+               : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
+                              ((uint32x4_t)(v)), c )
 
 #define v128_ror16( v, c ) \
-   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
-   : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
+              : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
+                             ((uint16x8_t)(v)), c )
 
 #define v128_rol16( v, c ) \
    ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
-   : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+              : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
+                             ((uint16x8_t)(v)), c )
 
 #define v128_ror8( v, c ) \
-   vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
+   vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+               ((uint8x16_t)(v)), c )
 
 #define v128_rol8( v, c ) \
-   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
+   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
+               ((uint8x16_t)(v)), c )
 
+// ror( v1 ^ v0, n )
+#if defined(__ARM_FEATURE_SHA3)
+
+#define v128_ror64xor( v1, v0, n )   vxarq_u64( v1, v0, n )
+
+#else
+
+#define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
+
+#endif
+
 #define v128_2ror64( v1, v0, c ) \
 { \
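Aside (not part of the commit): the rotate construction above works because VSRI preserves the top c bits of its first operand; shifting v left by 64-c parks the wrapped-around bits there and the insert then supplies v >> c below them. A scalar model of one 64-bit lane (ror64_model is a hypothetical name):

// Scalar model of one lane of v128_ror64( v, c ) for 0 < c < 64.
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64_model( uint64_t v, unsigned c )
{
   uint64_t hi = v << ( 64 - c );   // what vshlq_n_u64 produces
   uint64_t lo = v >> c;            // what vsriq_n_u64 shifts in
   return hi | lo;                  // the insert keeps hi's top c bits
}

int main(void)
{
   uint64_t v = 0x0123456789abcdefULL;
   printf( "%016llx\n", (unsigned long long)ror64_model( v, 8 ) );
   // expected: ef0123456789abcd
   return 0;
}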
@@ -416,7 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 */
 
 #define v128_shuffle8( v, vmask ) \
-   vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
+   vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
 
 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -465,7 +483,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
 #define v128_bswap32(v)    (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
 #define v128_bswap64(v)    (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
 #define v128_bswap128(v)   (uint32x4_t)v128_swap64( v128_bswap64(v) )
-#define v128_bswap256(p)   v128_bswap128( (p)[0], (p)[1] )
 
 // Usefull for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
@@ -534,16 +551,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
    casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
 }
 
-// Blendv
-#define v128_blendv( v1, v0, mask ) \
-   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
-
-/*
-// vbcaxq not defined
-#define v128_blendv( v1, v0, mask ) \
-   vbcaxq_u32( v128_and( mask, v1 ), v0, mask )
-*/
+// Bitwise blend using vector mask
+#define v128_blendv( v1, v0, mask )   vbslq_u32( mask, v1, v0 )
 
 #endif // __ARM_NEON
 
 #endif // SIMD_NEON_H__
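Aside (not part of the commit): vbslq_u32 is a bitwise select; each result bit comes from the second argument where the corresponding mask bit is 1 and from the third where it is 0. A small aarch64-only illustration:

// Illustrative only; compiles on aarch64 with NEON.
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
   uint32x4_t mask = vdupq_n_u32( 0xffff0000 );
   uint32x4_t a    = vdupq_n_u32( 0xaaaaaaaa );
   uint32x4_t b    = vdupq_n_u32( 0x55555555 );
   // (mask & a) | (~mask & b) for every bit position
   uint32x4_t r = vbslq_u32( mask, a, b );
   printf( "%08x\n", vgetq_lane_u32( r, 0 ) );   // expected: aaaa5555
   return 0;
}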