v23.7

2025-09-17 23:44:27 +00:00 · 2023-11-07 04:59:44 -05:00
parent 46dca7a493
commit e043698442
33 changed files with 3880 additions and 4763 deletions
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -1527,6 +1527,17 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
 #endif
 }

+static inline void extr_lane_2x64( void *dst, const void *src,
+                                   const int lane, const int bit_len )
+{
+   uint64_t *d = (uint64_t*)dst;
+   const uint64_t *s = (const uint64_t*)src;
+   d[ 0] = s[ lane    ];   d[ 1] = s[ lane+ 2 ];
+   d[ 2] = s[ lane+ 4 ];   d[ 3] = s[ lane+ 6 ];
+   if ( bit_len <= 256 ) return;
+   d[ 4] = s[ lane+ 8 ];   d[ 5] = s[ lane+10 ];
+   d[ 6] = s[ lane+12 ];   d[ 7] = s[ lane+14 ];
+}


 // 4x64   (AVX2)
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -152,16 +152,6 @@
 #define v128_unpacklo8                 _mm_unpacklo_epi8
 #define v128_unpackhi8                 _mm_unpackhi_epi8

-// New shorter agnostic name
-#define v128_ziplo64                   _mm_unpacklo_epi64
-#define v128_ziphi64                   _mm_unpackhi_epi64
-#define v128_ziplo32                   _mm_unpacklo_epi32
-#define v128_ziphi32                   _mm_unpackhi_epi32
-#define v128_ziplo16                   _mm_unpacklo_epi16
-#define v128_ziphi16                   _mm_unpackhi_epi16
-#define v128_ziplo8                    _mm_unpacklo_epi8
-#define v128_ziphi8                    _mm_unpackhi_epi8
-
 // AES
 #define v128_aesenc                    _mm_aesenc_si128
 #define v128_aesenclast                _mm_aesenclast_si128
@@ -171,7 +161,8 @@
 // Used instead if casting.
 typedef union
 {
-   __m128i m128;
+   v128_t   v128;
+   __m128i  m128;
   uint32_t u32[4];
 } __attribute__ ((aligned (16))) m128_ovly;
 #define v128_ovly   m128_ovly
@@ -218,19 +209,41 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
  return a;
 }

-// Emulate broadcast & insert instructions not available in SSE2
-// FYI only, not used anywhere
-//#define mm128_bcast_m64( v )   _mm_shuffle_epi32( v, 0x44 )
-//#define mm128_bcast_m32( v )   _mm_shuffle_epi32( v, 0x00 )
+// broadcast lane 0 to all lanes
+#define v128_bcast64(v)                 _mm_shuffle_epi32( v, 0x44 )
+#define v128_bcast32(v)                 _mm_shuffle_epi32( v, 0x00 )
+
+#if defined(__AVX2__)
+
+#define v128_bcast16(v)                 _mm_broadcastw_epi16(v)
+
+#else
+
+#define v128_bcast16(v) \
+   v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
+
+#endif
+
+// broadcast lane l to all lanes
+#define v128_replane64( v, l ) \
+   ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
+                : _mm_shuffle_epi32( v, 0xee )
+
+#define v128_replane32( v, l ) \
+    ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
+  : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
+  : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
+  :                _mm_shuffle_epi32( v, 0xff )

 // Pseudo constants
 #define v128_zero                       _mm_setzero_si128()
-#define m128_zero                       v128_zero
+#define m128_zero                       _mm_setzero_si128()
+

 #if defined(__SSE4_1__)

 // Bitwise AND, return 1 if result is all bits clear.
-#define v128_and_eq0                 _mm_testz_si128
+#define v128_and_eq0                    _mm_testz_si128

 static inline int v128_cmpeq0( v128_t v )
 {  return v128_and_eq0( v, v ); }
@@ -341,9 +354,12 @@ static inline __m128i v128_neg1_fn()
 */


+#define mm128_mask_32( v, m )    mm128_xim_32( v, v, m )
+
 // Zero 32 bit elements when corresponding bit in 4 bit mask is set.
-static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
-{   return mm128_xim_32( v, v, m ); }
+//static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
+//{   return mm128_xim_32( v, v, m ); }
+#define v128_mask32    mm128_mask_32

 // Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
 #define v128_movlane32( v1, l1, v0, l0 ) \
@@ -483,10 +499,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 //
 // Bit rotations

-// Neon has fast xor-ror, useful for big blake, if it actually works.
-#define v128_xror64( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ) c )
-
-
 // Slow bit rotation, used as last resort
 #define mm128_ror_64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -645,32 +657,55 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

 // Limited 2 input shuffle, combines shuffle with blend. The destination low
 // half is always taken from v1, and the high half from v2.
-#define mm128_shuffle2_64( v1, v2, c ) \
+#define v128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) ); 
+#define mm128_shuffle2_64   v128_shuffle2_64

-#define mm128_shuffle2_32( v1, v2, c ) \
+#define v128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) ); 
+#define mm128_shuffle2_32   v128_shuffle2_32

 // Rotate vector elements accross all lanes

-#define mm128_swap_64( v )     _mm_shuffle_epi32( v, 0x4e )
-#define v128_swap64            mm128_swap_64
-#define mm128_shuflr_64        mm128_swap_64
-#define mm128_shufll_64        mm128_swap_64
+#define v128_shuffle16( v, c ) \
+   _mm_or_si128( _mm_shufflehi_epi16( v, c ), _mm_shufflelo_epi16( v, c ) )

-// Don't use as an alias for byte sized bit rotation
-#define mm128_shuflr_32( v )   _mm_shuffle_epi32( v, 0x39 )
-#define v128_shuflr32          mm128_shuflr_32
+// reverse elements in vector
+#define v128_swap64(v)      _mm_shuffle_epi32( v, 0x4e )  // grandfathered 
+#define v128_rev64(v)       _mm_shuffle_epi32( v, 0x4e )  // preferred
+#define v128_rev32(v)       _mm_shuffle_epi32( v, 0x1b )
+#define v128_rev16(v)       v128_shuffle16( v, 0x1b )

-#define mm128_shufll_32( v )   _mm_shuffle_epi32( v, 0x93 )
-#define v128_shufll32          mm128_shufll_32
+// rotate vector elements
+#define v128_shuflr32(v)    _mm_shuffle_epi32( v, 0x39 )
+#define v128_shufll32(v)    _mm_shuffle_epi32( v, 0x93 )

-#define v128_swap64_32( v )      v128_ror64( v, 32 )
+#define v128_shuflr16(v)    v128_shuffle16( v, 0x39 )
+#define v128_shufll16(v)    v128_shuffle16( v, 0x93 )

-#define mm128_rev_32( v )      _mm_shuffle_epi32( v, 0x1b )
-#define v128_rev32             mm128_rev_32
+// Some sub-vector shuffles are identical to bit rotation. Shuffle is faster.
+// Bit rotation already promotes faster widths. Usage of these versions
+// are context sensitive.
+
+// reverse elements in vector lanes
+#define v128_qrev32(v)      v128_ror64( v, 32 )
+#define v128_swap64_32(v)   v128_ror64( v, 32 )   // grandfathered
+
+#define v128_qrev16(v) \
+    _mm_or_si128( _mm_shufflehi_epi16( v, v128u16( 0x1b ) ) \
+                  _mm_shufflelo_epi16( v, v128u16( 0x1b ) ) )
+
+#define v128_lrev16(v)      v128_ror32( v, 16 )
+
+// alias bswap
+#define v128_qrev8(v)       _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
+#define v128_lrev8(v)       _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
+#define v128_wrev8(v)       _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
+   
+// reverse bits, can it be done?
+//#define v128_bitrev8( v )              vrbitq_u8

 /* Not used
 #if defined(__SSSE3__)
@@ -682,7 +717,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #endif
 */

-//
 // Endian byte swap.

 #if defined(__SSSE3__)
@@ -798,8 +832,7 @@ static inline __m128i mm128_bswap_16( __m128i v )
  return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
 }

-#define mm128_bswap_128( v ) \
-   mm128_swap_64( mm128_bswap_64( v ) )
+#define mm128_bswap_128( v )   v128_qrev32( v128_bswap64( v ) )

 static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
 {
@@ -846,7 +879,7 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
   d[7] = mm128_bswap_32( s[7] );
 }
 #define mm128_block_bswap32_256 mm128_block_bswap_32
-#define v128_block_bswap32_256 mm128_block_bswap_32
+#define v128_block_bswap32_256  mm128_block_bswap_32

 static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
 {
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -375,16 +375,27 @@ static inline __m256i mm256_not( const __m256i v )
 // Cross lane shuffles
 //
 // Rotate elements accross all lanes.
+#define mm256_shuffle_16( v, c ) \
+   _mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \
+                    _mm256_shufflelo_epi16( v, c ) )

 // Swap 128 bit elements in 256 bit vector.
 #define mm256_swap_128( v )     _mm256_permute4x64_epi64( v, 0x4e )
-#define mm256_shuflr_128        mm256_swap_128
-#define mm256_shufll_128        mm256_swap_128
+#define mm256_rev_128( v )      _mm256_permute4x64_epi64( v, 0x4e )

 // Rotate 256 bit vector by one 64 bit element
 #define mm256_shuflr_64( v )    _mm256_permute4x64_epi64( v, 0x39 )
 #define mm256_shufll_64( v )    _mm256_permute4x64_epi64( v, 0x93 )

+// Reverse 64 bit elements 
+#define mm256_rev_64( v )       _mm256_permute4x64_epi64( v, 0x1b )
+
+#define mm256_rev_32( v ) \
+   _mm256_permute8x32_epi64( v, 0x0000000000000001, 0x0000000200000003, \
+                                0x0000000400000005, 0x0000000600000007 )
+
+#define mm256_rev_16( v ) \
+   _mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e )

 /* Not used
 // Rotate 256 bit vector by one 32 bit element.
@@ -423,12 +434,16 @@ static inline __m256i mm256_shufll_32( const __m256i v )
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) ); 

-#define mm256_swap128_64( v )     _mm256_shuffle_epi32( v, 0x4e )
-#define mm256_shuflr128_64        mm256_swap128_64
-#define mm256_shufll128_64        mm256_swap128_64
+#define mm256_swap128_64(v)     _mm256_shuffle_epi32( v, 0x4e )
+#define mm256_rev128_64(v)      _mm256_shuffle_epi32( v, 0x4e )
+#define mm256_rev128_32(v)      _mm256_shuffle_epi32( v, 0x1b )
+#define mm256_rev128_16(v)      mm256_shuffle_16( v, 0x1b )

-#define mm256_shuflr128_32( v )   _mm256_shuffle_epi32( v, 0x39 )
-#define mm256_shufll128_32( v )   _mm256_shuffle_epi32( v, 0x93 )
+#define mm256_shuflr128_32(v)   _mm256_shuffle_epi32( v, 0x39 )
+#define mm256_shufll128_32(v)   _mm256_shuffle_epi32( v, 0x93 )
+
+#define mm256_shuflr128_16(v)   _mm256_shuffle_epi16( v, 0x39 )
+#define mm256_shufll128_16(v)   _mm256_shuffle_epi16( v, 0x93 )

 /* Not used
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
@@ -436,7 +451,19 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 */

 // Same as bit rotation but logically used as byte/word rotation.
-#define mm256_swap64_32( v )     mm256_ror_64( v, 32 )
+#define mm256_swap64_32( v )    mm256_ror_64( v, 32 )  // grandfathered
+#define mm256_rev64_32( v )     mm256_ror_64( v, 32 ) 
+
+#define mm256_shuflr64_16(v)    _mm256_ror_epi64( v, 16 )
+#define mm256_shufll64_16(v)    _mm256_rol_epi64( v, 16 )
+
+#define mm256_shuflr64_8(v)     _mm256_ror_epi64( v,  8 )
+#define mm256_shufll64_8(v)     _mm256_rol_epi64( v,  8 )
+
+#define mm256_rev32_16( v )     mm256_ror_32( v, 16 ) 
+
+#define mm256_shuflr32_8(v)     _mm256_ror_epi32( v,  8 )
+#define mm256_shufll32_8(v)     _mm256_rol_epi32( v,  8 )

 // Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -15,11 +15,11 @@
 //  vxarq_u64( v1, v0, n )    ror( xor( v1, v0 ), n )
 //  vraxlq_u64( v1, v0 )      xor( rol( v1, 1 ), rol( v0, 1 ) )
 //  vbcaxq( v2, v1, v0 )      xor( v2, and( v1, not(v0) ) )
+//  vsraq_n( v1, v0, n )   add( v1, sr( v0, n ) )
 //
-//  might not work, not tried yet:
+//  Doesn't work on RPi but works on OPi:
 //
 //  vornq( v1, v0 )        or( v1, not( v0 ) )
-//  vsraq_n( v1, v0, n )   add( v1, sr( v0, n ) )

 #define v128_t                         uint32x4_t   // default, 
 #define v128u64_t                      uint64x2_t
@@ -31,6 +31,15 @@
 #define v128_load( p )                 vld1q_u32( (uint32_t*)(p) )
 #define v128_store( p, v )             vst1q_u32( (uint32_t*)(p), v )

+#define v128u64_load( p )              vld1q_u64( (uint64_t*)(p) )
+#define v128u64_store( p, v )          vst1q_u64( (uint64_t*)(p), v )
+#define v128u32_load( p )              vld1q_u32( (uint32_t*)(p) )
+#define v128u32_store( p, v )          vst1q_u32( (uint32_t*)(p), v )
+#define v128u16_load( p )              vld1q_u16( (uint16_t*)(p) )
+#define v128u16_store( p, v )          vst1q_u16( (uint16_t*)(p), v )
+#define v128u8_load( p )               vld1q_u16( (uint8_t*)(p) )
+#define v128u8_store( p, v )           vst1q_u16( (uint8_t*)(p), v )
+
 // load & set1 combined
 #define v128_load1_64(p)               vld1q_dup_u64( (uint64_t*)(p) )
 #define v128_load1_32(p)               vld1q_dup_u32( (uint32_t*)(p) )
@@ -74,6 +83,9 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )

 #define v128_cmpeq0                    vceqzq_u64

+// Not yet needed
+//#define v128_cmpeq1
+
 #define v128_cmpgt64                   vcgtq_u64
 #define v128_cmpgt32                   vcgtq_u32
 #define v128_cmpgt16                   vcgtq_u16
@@ -95,7 +107,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sr16                      vshrq_n_u16
 #define v128_sr8                       vshrq_n_u8

-// Maybe signed shift will work.
+// Unit tested, working.
 #define v128_sra64                     vshrq_n_s64
 #define v128_sra32                     vshrq_n_s32
 #define v128_sra16                     vshrq_n_s16
@@ -103,25 +115,47 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // unary logic
 #define v128_not                       vmvnq_u32

-// binary
+// binary logic
 #define v128_or                        vorrq_u32
 #define v128_and                       vandq_u32
 #define v128_xor                       veorq_u32
-#define v128_andnot                    vandq_u32
-#define v128_xnor( v1, v0 )            v128_not( v128_xor( v1, v0 ) )
-#define v128_ornot                     vornq_u32 

-// ternary logic, veorq_u32 not defined
+// ~v1 & v0
+#define v128_andnot( v1, v0 )          vandq_u32( vmvnq_u32( v1 ), v0 )
+
+// ~( a ^ b ), same as (~a) ^ b
+#define v128_xnor( v1, v0 )            v128_not( v128_xor( v1, v0 ) )
+
+// ~v1 | v0, x86_64 convention, first arg is  not'ed
+#define v128_ornot( v1, v0 )           vornq_u32( v0, v1 ) 
+
+// ternary logic
+
+// v2 ^ v1 ^ v0
+// veorq_u32 not defined
 //#define v128_xor3                      veor3q_u32
 #define v128_xor3( v2, v1, v0 )        veorq_u32( v2, veorq_u32( v1, v0 ) )
-#define v128_nor                       vornq_u32
+
+// v2 & v1 & v0
+#define v128_and3( v2, v1, v0 )        v128_and( v2, v128_and( v1, v0 ) )
+
+// v2 | v1 | v0
+#define v128_or3( v2, v1, v0 )         v128_or( v2, v128_or( v1, v0 ) )
+
+// a ^ ( ~b & c )
 #define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
-#define v128_and3( a, b, c )           v128_and( a, v128_and( b, c ) )
-#define v128_or3( a, b, c )            v128_or( a, v128_or( b, c ) )
-#define v128_xorand( a, b, c )         v128_xor( a, v128_and( b, c ) )
-#define v128_andxor( a, b, c )         v128_and( a, v128_xor( b, c ) )
-#define v128_xoror( a, b, c )          v128_xor( a, v128_or( b, c ) )
-#define v128_orand( a, b, c )          v128_or( a, v128_and( b, c ) )
+
+// a ^ ( b & c )
+#define v128_xorand( v2, v1, v0 )      v128_xor( v2, v128_and( v1, v0 ) )
+
+// a & ( b ^ c )
+#define v128_andxor( v2, v1, v0 )      v128_and( v2, v128_xor( v1, v0 ) )
+
+// a ^ ( b | c )
+#define v128_xoror( v2, v1, v0 )       v128_xor( v2, v128_or( v1, v0 ) )
+
+// v2 | ( v1 & v0 )
+#define v128_orand( v2, v1, v0 )       v128_or( v2, v128_and( v1, v0 ) )

 // shift 2 concatenated vectors right.
 #define v128_alignr64( v1, v0, c )     vextq_u64( v0, v1, c )
@@ -129,24 +163,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_alignr8(  v1, v0, c )     vextq_u8(  v0, v1, c ) 

 // Intetleave high or low half of 2 vectors.
-#define v128_unpacklo64( v1, v0 )      vzip1q_u64( v0, v1 )
-#define v128_unpackhi64( v1, v0 )      vzip2q_u64( v0, v1 )
-#define v128_unpacklo32( v1, v0 )      vzip1q_u32( v0, v1 )
-#define v128_unpackhi32( v1, v0 )      vzip2q_u32( v0, v1 )
-#define v128_unpacklo16( v1, v0 )      vzip1q_u16( v0, v1 )
-#define v128_unpackhi16( v1, v0 )      vzip2q_u16( v0, v1 )
-#define v128_unpacklo8(  v1, v0 )      vzip1q_u8(  v0, v1 )
-#define v128_unpackhi8(  v1, v0 )      vzip2q_u8(  v0, v1 )
+#define v128_unpacklo64( v1, v0 )      vzip1q_u64( v1, v0 )
+#define v128_unpackhi64( v1, v0 )      vzip2q_u64( v1, v0 )
+#define v128_unpacklo32( v1, v0 )      vzip1q_u32( v1, v0 )
+#define v128_unpackhi32( v1, v0 )      vzip2q_u32( v1, v0 )
+#define v128_unpacklo16( v1, v0 )      vzip1q_u16( v1, v0 )
+#define v128_unpackhi16( v1, v0 )      vzip2q_u16( v1, v0 )
+#define v128_unpacklo8(  v1, v0 )      vzip1q_u8(  v1, v0 )
+#define v128_unpackhi8(  v1, v0 )      vzip2q_u8(  v1, v0 )

-// Shorter achchitecture agnostic names for unpack using NEON-like mnemonics
-#define v128_ziplo64                   vzip1q_u64
-#define v128_ziphi64                   vzip2q_u64
-#define v128_ziplo32                   vzip1q_u32
-#define v128_ziphi32                   vzip2q_u32
-#define v128_ziplo16                   vzip1q_u16
-#define v128_ziphi16                   vzip2q_u16
-#define v128_ziplo8                    vzip1q_u8
-#define v128_ziphi8                    vzip2q_u8

 // AES
 // consistent with Intel AES, break up for optimizing
@@ -158,10 +183,22 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )

 typedef union
 {
+   uint32x4_t v128;
   uint32x4_t m128;
-   uint32_t u32[4];
+   uint32_t   u32[4];
 } __attribute__ ((aligned (16))) v128_ovly;

+
+// Broadcast lane 0 to all lanes
+#define v128_bcast64(v)                vdupq_laneq_u64( v, 0 )
+#define v128_bcast32(v)                vdupq_laneq_u32( v, 0 )
+#define v128_bcast16(v)                vdupq_laneq_u16( v, 0 )
+
+// Replicate (broadcast) lane l to all lanes
+#define v128_replane64( v, l )         vdupq_laneq_u64( v, l )
+#define v128_replane32( v, l )         vdupq_laneq_u32( v, l )
+#define v128_replane16( v, l )         vdupq_laneq_u16( v, l )
+
 // pointer indexing
 #define casti_v128( p, i )             (((uint32x4_t*)(p))[i])
 #define cast_v128( p )                 (*((uint32x4_t*)(p)))
@@ -255,12 +292,13 @@ typedef union
 #define v128_negate16              vnegq_s16
 #define v128_negate8               vnegq_s8

+// Nothing else seems to work
 static inline void v128_memset_zero( void *dst, const int n )
 {
-   for( int i = 0; i < n; i++ )
-      ((uint32x4_t*)dst)[n] = (uint32x4_t)(uint128_t)0;
+    memset( dst, 0, n*16 );
 }

+
 static inline void v128_memset( void *dst, const void *src, const int n )
 {
   for( int i = 0; i < n; i++ )
@@ -273,67 +311,40 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
      ((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
 }

-// how to build a bitmask from vector elements?
+// how to build a bitmask from vector elements? Efficiently???
 #define v128_movmask32                 
 #define v128_movmask64                

 // Bit rotation
-//TODO, maybe, Optimize 64 bit rotations
-// Fall back for odd bit rotations
-static inline uint64x2_t v128_ror64( uint64x2_t v, int c )
-{
-   return vsriq_n_u64( vshlq_n_u64( (uint64x2_t)v, 64-c ), (uint64x2_t)v, c );
-}
-
-static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
-{
-   return vsliq_n_u64( vshrq_n_u64( (uint64x2_t)v, 64-c ), (uint64x2_t)v, c );
-}
-
-//static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
-//{  return vsriq_n_u64( vshlq_n_u64( v, c ), v, 64-c ); }
-
-static inline uint32x4_t v128_ror32( uint32x4_t v, int c )
-{  return vsriq_n_u32( vshlq_n_u32( v, 32-c ), v, c ); }
-
-static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
-{  return vsliq_n_u32( vshrq_n_u32( v, 32-c ), v, c ); }
-
-//static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
-//{  return vsriq_n_u32( vshlq_n_u32( v, c ), v, 32-c ); }
-
-static inline uint16x8_t v128_ror16( uint16x8_t v, int c )
-{  return vsriq_n_u16( vshlq_n_u16( v, 16-c ), v, c ); }
-
-static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
-{  return vsliq_n_u16( vshrq_n_u16( v, 16-c ), v, c ); }
-
-//static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
-//{  return vsriq_n_u16( vshlq_n_u16( v, c ), v, 16-c ); }
-
-static inline uint8x16_t v128_ror8( uint8x16_t v, int c )
-{  return vsriq_n_u8( vshlq_n_u8( v, 8-c ), v, c ); }
-
-static inline uint8x16_t v128_rol8( uint8x16_t v, int c )
-{  return vsliq_n_u8( vshrq_n_u8( v, 8-c ), v, c ); }
-
-//static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
-//{  return vsriq_n_u8( vshlq_n_u8( v, c ), v, 8-c ); }
-
-/*
-// Optimzed for half element rotations (swap)
 #define v128_ror64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_ror64_neon( v, c )
+  ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
+   : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )

 #define v128_rol64( v, c ) \
-   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_rol64_neon( v, c )
-   
+  ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
+   : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
+
 #define v128_ror32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_ror32_neon( v, c )
+  ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
+   : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )

 #define v128_rol32( v, c ) \
-   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_rol32_neon( v, c )
-*/
+  ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
+   : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
+
+#define v128_ror16( v, c ) \
+  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
+   : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+
+#define v128_rol16( v, c ) \
+  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
+   : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
+
+#define v128_ror8( v, c ) \
+      vsriq_n_u8( vshlq_n_u8(  ((uint8x16_t)v),  8-c ), ((uint8x16_t)v), c )
+
+#define v128_rol8( v, c ) \
+      vsliq_n_u8( vshrq_n_u8(  ((uint8x16_t)v),  8-c ), ((uint8x16_t)v), c )

 #define v128_2ror64( v1, v0, c ) \
 { \
@@ -361,7 +372,7 @@ static inline uint8x16_t v128_rol8( uint8x16_t v, int c )
 uint32x4_t t1 = vshrq_n_u32( v1, c ); \
 v0 = vsliq_n_u32( v0, 32-(c) ); \
 v1 = vsliq_n_u32( v1, 32-(c) ); \
- v0 = vorrq_u32( v0, t0 ); \
+ v0 = vorrq_32( v0, t0 ); \
 v1 = vorrq_u32( v1, t1 ); \
 }

@@ -375,16 +386,6 @@ static inline uint8x16_t v128_rol8( uint8x16_t v, int c )
 v1 = vorrq_u32( v1, t1 ); \
 }

-// vector rotation , size?
-static inline uint64x2_t v128_swap64( uint64x2_t v )
-{   return vextq_u64( v, v, 1 ); }
-
-static inline uint32x4_t v128_shuflr32( uint32x4_t v )
-{   return vextq_u32( v, v, 1 ); }
-
-static inline uint32x4_t v128_shufll32( uint32x4_t v )
-{   return vextq_u32( v, v, 3 ); }
-
 // Cross lane shuffles, no programmable shuffle in NEON

 // vector mask, use as last resort. prefer rev, alignr, etc
@@ -413,29 +414,54 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )

-#define v128_swap64_32( v )             vrev64q_u32( v )
-#define v128_v128_shuflr64_16( v )      v128_ror_64( v, 16 )
-#define v128_v128_shufll64_16( v )      v128_rol_64( v, 16 )
+// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
+// Bit rotation already promotes faster widths. Usage is context sensitive.
+// preferred.

-// Don't use as an alias for byte sized bit rotation
-#define v128_swap32_16( v )             vrev64q_u16( v )
-#define v128_v128_shuflr32_8( v )       v128_ror_32( v, 8 )
-#define v128_v128_shufll32_8( v )       v128_rol_32( v, 8 )
+// reverse elements in vector lanes
+#define v128_qrev32            vrev64q_u32
+#define v128_swap64_32         vrev64q_u32  // grandfathered

-// reverse elements
-#define v128_rev32( v )                vrev64q_u32( v )
-#define v128_rev16( v )                vrev64q_u16( v )
-#define v128_rev8( v )                 vrev64q_u8( v )
+#define v128_qrev16            vrev64q_u16
+#define v128_lrev16            vrev32q_u16

-// reverse bits, nothing like it in x86_64
-#define v128_bitrev8( v )              vrbitq_u8
+// aka bswap
+#define v128_qrev8             vrev64q_u8
+#define v128_lrev8             vrev32q_u8
+#define v128_wrev8             vrev16q_u8
+
+// full vector rotation
+
+// reverse elements in vector
+static inline uint64x2_t v128_rev64( uint64x2_t v )
+{   return vextq_u64( v, v, 1 ); }
+#define v128_swap64     v128_rev64   // grandfathered
+
+#define v128_rev32(v)        v128_rev64( v128_qrev32( v ) )
+#define v128_rev16(v)        v128_rev64( v128_qrev16( v ) )
+
+// shuffle-rotate vector elements
+static inline uint32x4_t v128_shuflr32( uint32x4_t v )
+{   return vextq_u32( v, v, 1 ); }
+
+static inline uint32x4_t v128_shufll32( uint32x4_t v )
+{   return vextq_u32( v, v, 3 ); }
+
+static inline uint16x8_t v128_shuflr16( uint16x8_t v )
+{   return vextq_u16( v, v, 1 ); }
+
+static inline uint16x8_t v128_shufll16( uint16x8_t v )
+{   return vextq_u16( v, v, 7 ); }
+
+// reverse bits in bytes, nothing like it in x86_64
+#define v128_bitrev8           vrbitq_u8

 // reverse byte order
-#define v128_bswap16(v)                (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
-#define v128_bswap32(v)                (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
-#define v128_bswap64(v)                (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
-#define v128_bswap128(v)               (uint32x4_t)v128_swap64( v128_bswap64(v) )
-#define v128_bswap256(p)               v128_bswap128( (p)[0], (p)[1] ) 
+#define v128_bswap16(v)        (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
+#define v128_bswap32(v)        (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
+#define v128_bswap64(v)        (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
+#define v128_bswap128(v)       (uint32x4_t)v128_swap64( v128_bswap64(v) )
+#define v128_bswap256(p)       v128_bswap128( (p)[0], (p)[1] ) 

 // Usefull for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \