v24.6

2025-09-17 23:44:27 +00:00 · 2024-12-08 11:14:08 -05:00
parent 8e91bfbe19
commit 06624a0ff2
18 changed files with 1526 additions and 1140 deletions
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,

 // 4x32

-#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
+#if defined(__x86_64__) && defined(__SSE2__) 

 #define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
 { \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
   STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
 }

+
 static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
                           void *dst3, const void *src, const int bit_len )
 {
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
   STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
 }

+#elif defined(__aarch64__) && defined(__ARM_NEON)
+
+static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
+                      const void *src2, const void *src3, const int bit_len )
+{
+   uint32x4x4_t s;
+
+   s.val[0] = casti_v128u32( src0, 0 );
+   s.val[1] = casti_v128u32( src1, 0 );
+   s.val[2] = casti_v128u32( src2, 0 );
+   s.val[3] = casti_v128u32( src3, 0 );
+   vst4q_u32( dst, s );
+
+   s.val[0] = casti_v128u32( src0, 1 );
+   s.val[1] = casti_v128u32( src1, 1 );
+   s.val[2] = casti_v128u32( src2, 1 );
+   s.val[3] = casti_v128u32( src3, 1 );
+   vst4q_u32( dst + 64, s );
+   
+   if ( bit_len <= 256 ) return;
+
+   s.val[0] = casti_v128u32( src0, 2 );
+   s.val[1] = casti_v128u32( src1, 2 );
+   s.val[2] = casti_v128u32( src2, 2 );
+   s.val[3] = casti_v128u32( src3, 2 );
+   vst4q_u32( dst + 128, s );
+
+   s.val[0] = casti_v128u32( src0, 3 );
+   s.val[1] = casti_v128u32( src1, 3 );
+   s.val[2] = casti_v128u32( src2, 3 );
+   s.val[3] = casti_v128u32( src3, 3 );
+   vst4q_u32( dst + 192, s );
+
+   if ( bit_len <= 512 ) return;
+
+   s.val[0] = casti_v128u32( src0, 4 );
+   s.val[1] = casti_v128u32( src1, 4 );
+   s.val[2] = casti_v128u32( src2, 4 );
+   s.val[3] = casti_v128u32( src3, 4 );
+   vst4q_u32( dst + 256, s );
+
+   if ( bit_len <= 640 ) return;
+
+   s.val[0] = casti_v128u32( src0, 5 );
+   s.val[1] = casti_v128u32( src1, 5 );
+   s.val[2] = casti_v128u32( src2, 5 );
+   s.val[3] = casti_v128u32( src3, 5 );
+   vst4q_u32( dst + 320, s );
+
+   s.val[0] = casti_v128u32( src0, 6 );
+   s.val[1] = casti_v128u32( src1, 6 );
+   s.val[2] = casti_v128u32( src2, 6 );
+   s.val[3] = casti_v128u32( src3, 6 );
+   vst4q_u32( dst + 384, s );
+
+   s.val[0] = casti_v128u32( src0, 7 );
+   s.val[1] = casti_v128u32( src1, 7 );
+   s.val[2] = casti_v128u32( src2, 7 );
+   s.val[3] = casti_v128u32( src3, 7 );
+   vst4q_u32( dst + 448, s );
+
+// if ( bit_len <= 1024 return;
+}
+
+static inline void intrlv_4x32_512( void *dst, const void *src0,
+                     const void *src1, const void *src2, const void *src3 )
+{
+   uint32x4x4_t s;
+
+   s.val[0] = casti_v128u32( src0, 0 );
+   s.val[1] = casti_v128u32( src1, 0 );
+   s.val[2] = casti_v128u32( src2, 0 );
+   s.val[3] = casti_v128u32( src3, 0 );
+   vst4q_u32( dst, s );
+
+   s.val[0] = casti_v128u32( src0, 1 );
+   s.val[1] = casti_v128u32( src1, 1 );
+   s.val[2] = casti_v128u32( src2, 1 );
+   s.val[3] = casti_v128u32( src3, 1 );
+   vst4q_u32( dst + 64, s );
+
+   s.val[0] = casti_v128u32( src0, 2 );
+   s.val[1] = casti_v128u32( src1, 2 );
+   s.val[2] = casti_v128u32( src2, 2 );
+   s.val[3] = casti_v128u32( src3, 2 );
+   vst4q_u32( dst + 128, s );
+
+   s.val[0] = casti_v128u32( src0, 3 );
+   s.val[1] = casti_v128u32( src1, 3 );
+   s.val[2] = casti_v128u32( src2, 3 );
+   s.val[3] = casti_v128u32( src3, 3 );
+   vst4q_u32( dst + 192, s );
+}
+
+static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
+                           void *dst3, const void *src, int bit_len )
+{
+   uint32x4x4_t s = vld4q_u32( src );
+
+   casti_v128( dst0, 0 ) = s.val[0];
+   casti_v128( dst1, 0 ) = s.val[1];
+   casti_v128( dst2, 0 ) = s.val[2];
+   casti_v128( dst3, 0 ) = s.val[3];
+
+   s = vld4q_u32( src + 64 );
+   casti_v128( dst0, 1 ) = s.val[0];
+   casti_v128( dst1, 1 ) = s.val[1];
+   casti_v128( dst2, 1 ) = s.val[2];
+   casti_v128( dst3, 1 ) = s.val[3];
+
+   if ( bit_len <= 256 ) return;
+
+   s = vld4q_u32( src + 128 );
+   casti_v128( dst0, 2 ) = s.val[0];
+   casti_v128( dst1, 2 ) = s.val[1];
+   casti_v128( dst2, 2 ) = s.val[2];
+   casti_v128( dst3, 2 ) = s.val[3];
+
+   s = vld4q_u32( src + 192 );
+   casti_v128( dst0, 3 ) = s.val[0];
+   casti_v128( dst1, 3 ) = s.val[1];
+   casti_v128( dst2, 3 ) = s.val[2];
+   casti_v128( dst3, 3 ) = s.val[3];
+
+   if ( bit_len <= 512 ) return;
+
+   s = vld4q_u32( src + 256 );
+   casti_v128( dst0, 4 ) = s.val[0];
+   casti_v128( dst1, 4 ) = s.val[1];
+   casti_v128( dst2, 4 ) = s.val[2];
+   casti_v128( dst3, 4 ) = s.val[3];
+
+   if ( bit_len <= 640 ) return;
+
+   s = vld4q_u32( src + 320 );
+   casti_v128( dst0, 5 ) = s.val[0];
+   casti_v128( dst1, 5 ) = s.val[1];
+   casti_v128( dst2, 5 ) = s.val[2];
+   casti_v128( dst3, 5 ) = s.val[3];
+
+   s = vld4q_u32( src + 384 );
+   casti_v128( dst0, 6 ) = s.val[0];
+   casti_v128( dst1, 6 ) = s.val[1];
+   casti_v128( dst2, 6 ) = s.val[2];
+   casti_v128( dst3, 6 ) = s.val[3];
+
+   s = vld4q_u32( src + 448 );
+   casti_v128( dst0, 6 ) = s.val[0];
+   casti_v128( dst1, 6 ) = s.val[1];
+   casti_v128( dst2, 6 ) = s.val[2];
+   casti_v128( dst3, 6 ) = s.val[3];
+
+//   if ( bit_len <= 1024 ) return;
+}
+
+static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
+                           void *dst3, const void *src )
+{
+   uint32x4x4_t s = vld4q_u32( src );
+
+   casti_v128( dst0, 0 ) = s.val[0];
+   casti_v128( dst1, 0 ) = s.val[1];
+   casti_v128( dst2, 0 ) = s.val[2];
+   casti_v128( dst3, 0 ) = s.val[3];
+
+   s = vld4q_u32( src + 64 );
+   casti_v128( dst0, 1 ) = s.val[0];
+   casti_v128( dst1, 1 ) = s.val[1];
+   casti_v128( dst2, 1 ) = s.val[2];
+   casti_v128( dst3, 1 ) = s.val[3];
+
+   s = vld4q_u32( src + 128 );
+   casti_v128( dst0, 2 ) = s.val[0];
+   casti_v128( dst1, 2 ) = s.val[1];
+   casti_v128( dst2, 2 ) = s.val[2];
+   casti_v128( dst3, 2 ) = s.val[3];
+
+   s = vld4q_u32( src + 192 );
+   casti_v128( dst0, 3 ) = s.val[0];
+   casti_v128( dst1, 3 ) = s.val[1];
+   casti_v128( dst2, 3 ) = s.val[2];
+   casti_v128( dst3, 3 ) = s.val[3];
+}
+
 #else  // !SSE2 && !NEON

 static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )

 #endif

-#if defined(__SSE2__)
-
 static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
 {
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
+  v128u32_t s0 = casti_v128u32( src,0 );
+  v128u32_t s1 = casti_v128u32( src,1 );
+  v128u32_t s2 = casti_v128u32( src,2 );
+  v128u32_t s3 = casti_v128u32( src,3 );
+  v128u32_t s4 = casti_v128u32( src,4 );

 #if defined(__SSSE3__)

@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )

 #endif

-  casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
-  casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
-  casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
-  casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
+  casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
+  casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
+  casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
+  casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );

-  casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
-  casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
-  casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
-  casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
+  casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
+  casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
+  casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
+  casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );

-  casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
-  casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
-  casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
-  casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
+  casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
+  casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
+  casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
+  casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );

-  casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
-  casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
-  casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
-  casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
+  casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
+  casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
+  casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
+  casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );

-  casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
-  casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
-  casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
-  casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
+  casti_v128u32( d,16 ) = v128_duplane32( s2, 0 );
+  casti_v128u32( d,17 ) = v128_duplane32( s2, 1 );
+  casti_v128u32( d,18 ) = v128_duplane32( s2, 2 );
+  casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
 }

-#elif defined(__aarch64__) && defined(__ARM_NEON)
-
-static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
-{
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
-
-  s0 = v128_bswap32( s0 );
-  s1 = v128_bswap32( s1 );
-  s2 = v128_bswap32( s2 );
-  s3 = v128_bswap32( s3 );
-  s4 = v128_bswap32( s4 );
-
-  casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
-  casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
-  casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
-  casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
-
-  casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
-  casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
-  casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
-  casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
-
-  casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
-  casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
-  casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
-  casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
-
-  casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
-  casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
-  casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
-  casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
-
-  casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
-  casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
-  casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
-  casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
-}
-
-#endif
-
 // 8x32

-
 #if defined(__AVX2__)

 #define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 //
 //     64 bit data

-// 2x64    SSE2, NEON
+// 2x64    
+
+#if defined(__x86_64__) && defined(__SSE2__)

 static inline void intrlv_2x64( void *dst, const void *src0,
                                const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
   d1[7] = v128_unpackhi64( s[14], s[15] );
 }

-/*
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+
+static inline void intrlv_2x64( void *dst, const void *src0,
+                                const void *src1, const int bit_len )
+{
+   uint64x2x2_t s;
+
+   s.val[0] = casti_v128u64( src0, 0 );
+   s.val[1] = casti_v128u64( src1, 0 );
+   vst2q_u64( dst, s );
+
+   s.val[0] = casti_v128u64( src0, 1 );
+   s.val[1] = casti_v128u64( src1, 1 );
+   vst2q_u64( dst + 32, s );
+   
+   if ( bit_len <= 256 ) return;
+
+   s.val[0] = casti_v128u64( src0, 2 );
+   s.val[1] = casti_v128u64( src1, 2 );
+   vst2q_u64( dst + 64, s );
+
+   s.val[0] = casti_v128u64( src0, 3 );
+   s.val[1] = casti_v128u64( src1, 3 );
+   vst2q_u64( dst + 96, s );
+
+   if ( bit_len <= 512 ) return;
+
+   s.val[0] = casti_v128u64( src0, 4 );
+   s.val[1] = casti_v128u64( src1, 4 );
+   vst2q_u64( dst + 128, s );
+
+   if ( bit_len <= 640 ) return;
+
+   s.val[0] = casti_v128u64( src0, 5 );
+   s.val[1] = casti_v128u64( src1, 5 );
+   vst2q_u64( dst + 160, s );
+
+   s.val[0] = casti_v128u64( src0, 6 );
+   s.val[1] = casti_v128u64( src1, 6 );
+   vst2q_u64( dst + 192, s );
+
+   s.val[0] = casti_v128u64( src0, 7 );
+   s.val[1] = casti_v128u64( src1, 7 );
+   vst2q_u64( dst + 224, s );
+
+//   if ( bit_len <= 1024 ) return;
+}
+
+static inline void dintrlv_2x64( void *dst0, void *dst1,
+                                 const void *src, const int bit_len )
+{
+   uint64x2x2_t s = vld2q_u64( src );
+
+   casti_v128u64( dst0, 0 ) = s.val[0];
+   casti_v128u64( dst1, 0 ) = s.val[1];
+
+   s = vld2q_u64( src + 32 );
+   casti_v128u64( dst0, 1 ) = s.val[0];
+   casti_v128u64( dst1, 1 ) = s.val[1];
+
+   if ( bit_len <= 256 ) return;
+
+   s = vld2q_u64( src + 64 );
+   casti_v128u64( dst0, 2 ) = s.val[0];
+   casti_v128u64( dst1, 2 ) = s.val[1];
+
+   s = vld2q_u64( src + 96 );
+   casti_v128u64( dst0, 3 ) = s.val[0];
+   casti_v128u64( dst1, 3 ) = s.val[1];
+
+   if ( bit_len <= 512 ) return;
+
+   s = vld2q_u64( src + 128 );
+   casti_v128u64( dst0, 4 ) = s.val[0];
+   casti_v128u64( dst1, 4 ) = s.val[1];
+
+   if ( bit_len <= 640 ) return;
+
+   s = vld2q_u64( src + 160 );
+   casti_v128u64( dst0, 5 ) = s.val[0];
+   casti_v128u64( dst1, 5 ) = s.val[1];  
+
+   s = vld2q_u64( src + 192 );
+   casti_v128u64( dst0, 6 ) = s.val[0];
+   casti_v128u64( dst1, 6 ) = s.val[1];   
+
+   s = vld2q_u64( src + 224 );
+   casti_v128u64( dst0, 7 ) = s.val[0];
+   casti_v128u64( dst1, 7 ) = s.val[1];   
+
+//   if ( bit_len <= 1024 ) return;
+}
+   
+#else
+
 static inline void intrlv_2x64( void *dst, const void *src0,
                                const void *src1, const int bit_len )
 {
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
   d[24] = s0[12];    d[25] = s1[12];   d[26] = s0[13];    d[27] = s1[13];
   d[28] = s0[14];    d[29] = s1[14];   d[30] = s0[15];    d[31] = s1[15];
 }
-*/
-/*
+
 static inline void dintrlv_2x64( void *dst0, void *dst1,
                                 const void *src, const int bit_len )
 {
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
   d0[12] = s[24];   d1[12] = s[25];   d0[13] = s[26];   d1[13] = s[27];
   d0[14] = s[28];   d1[14] = s[29];   d0[15] = s[30];   d1[15] = s[31];
 }
-*/
+
+#endif

 static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
 {
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
+  v128u64_t s0 = casti_v128u64( src,0 );
+  v128u64_t s1 = casti_v128u64( src,1 );
+  v128u64_t s2 = casti_v128u64( src,2 );
+  v128u64_t s3 = casti_v128u64( src,3 );
+  v128u64_t s4 = casti_v128u64( src,4 );

 #if defined(__SSSE3__)

@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )

 #endif

-#if defined(__SSE2__)
+  casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
+  casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );

-  casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
-  casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
+  casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
+  casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );

-  casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
-  casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
+  casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
+  casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );

-  casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
-  casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
+  casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
+  casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );

-  casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
-  casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
-
-  casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
-  casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
-
-#elif defined(__ARM_NEON)
-
-  casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
-  casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
-
-  casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
-  casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
-
-  casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
-  casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
-
-  casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
-  casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
-
-  casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
-  casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
-
-#endif
+  casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
+  casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
 }

 static inline void extr_lane_2x64( void *dst, const void *src,
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -439,11 +439,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #define v128_ornot( v1, v0 )      _mm_or_si128( v128_not( v1 ), v0 )

-#define v128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )
+#define v128_xor3( a, b, c )      _mm_xor_si128( _mm_xor_si128( a, b ), c )

-#define v128_and3( a, b, c )      _mm_and_si128( a, _mm_and_si128( b, c ) )
+#define v128_and3( a, b, c )      _mm_and_si128( _mm_and_si128( a, b ), c )

-#define v128_or3( a, b, c )       _mm_or_si128( a, _mm_or_si128( b, c ) )
+#define v128_or3( a, b, c )       _mm_or_si128( _mm_or_si128( a, b ), c )

 #define v128_xorand( a, b, c )    _mm_xor_si128( a, _mm_and_si128( b, c ) )

--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -174,17 +174,22 @@ static inline __m256i mm256_not( const __m256i v )

 #define mm256_ornot( v1, v0 )      _mm256_or_si256( mm256_not( v1 ), v0 )

+// usage hints to improve performance when ternary logic is not avalable:
+// If overwriting an input arg put that arg first so the intermediate
+// result can be stored in the dest.
+// Put an arg with the nearest dependency last so independant args can be
+// processed first.
 #define mm256_xor3( a, b, c ) \
-  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
+  _mm256_xor_si256( _mm256_xor_si256( a, b ), c )

 #define mm256_xor4( a, b, c, d ) \
  _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

 #define mm256_and3( a, b, c ) \
-  _mm256_and_si256( a, _mm256_and_si256( b, c ) )
+  _mm256_and_si256( _mm256_and_si256( a, b ), c )

 #define mm256_or3( a, b, c ) \
-   _mm256_or_si256( a, _mm256_or_si256( b, c ) )
+   _mm256_or_si256( _mm256_or_si256( a, b ), c )

 #define mm256_xorand( a, b, c ) \
  _mm256_xor_si256( a, _mm256_and_si256( b, c ) )
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -2,7 +2,7 @@
 #define SIMD_INT_H__ 1

 //TODO compile time test for byte order
-// be64 etc using HW bowap.
+// be64 etc using HW bswap.
 //
 // Endian byte swap
 #if defined(__x86_64__)
@@ -94,7 +94,7 @@ static inline uint16_t be16( const uint16_t u16 )
  return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) <<  8 );
 }

-static inline uint32_t le162( const uint16_t u16 )
+static inline uint32_t le16( const uint16_t u16 )
 {
   const uint8_t *p = (uint8_t const *)&u16;
   return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) <<  8 );
@@ -112,7 +112,7 @@ static inline uint32_t le162( const uint16_t u16 )
 #elif defined(__aarch64__)

 // Documentation is vague, ror exists but is ambiguous. Docs say it can
-// do 32 or 64 registers. Assuming that is architecture specific andcan
+// do 32 or 64 bit registers. Assuming that is architecture specific and can
 // only do 32 bit on 32 bit arch. Rarely used so not a big issue.
 static inline uint64_t ror64( uint64_t a, const int c )
 {
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -93,6 +93,8 @@
 #define v128_cmplt16( v1, v0 )      vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
 #define v128_cmplt8( v1, v0 )       vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )

+#define v128_cmpeq_zero                vceqzq_u64
+
 // Logical bit shift
 #define v128_sl64                     vshlq_n_u64
 #define v128_sl32                     vshlq_n_u32
@@ -135,14 +137,14 @@
 #if defined(__ARM_FEATURE_SHA3)
  #define v128_xor3                   veor3q_u32
 #else
-  #define v128_xor3( v2, v1, v0 )     veorq_u32( v2, veorq_u32( v1, v0 ) )
+  #define v128_xor3( v2, v1, v0 )     veorq_u32( veorq_u32( v2, v1 ), v0 )
 #endif

 // v2 & v1 & v0
-#define v128_and3( v2, v1, v0 )       v128_and( v2, v128_and( v1, v0 ) )
+#define v128_and3( v2, v1, v0 )       v128_and( v128_and( v2, v1 ), v0 )

 // v2 | v1 | v0
-#define v128_or3( v2, v1, v0 )        v128_or( v2, v128_or( v1, v0 ) )
+#define v128_or3( v2, v1, v0 )        v128_or( v128_or( v2, v1 ), v0 )

 // v2 ^ ( ~v1 & v0 )
 #if defined(__ARM_FEATURE_SHA3)
@@ -178,6 +180,7 @@
 #define v128_unpacklo8(  v1, v0 )     vzip1q_u8(  v1, v0 )
 #define v128_unpackhi8(  v1, v0 )     vzip2q_u8(  v1, v0 )

+// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.

 // AES
 // consistent with Intel AES intrinsics, break up for optimizing
@@ -237,18 +240,15 @@ typedef union
 #define cast_v128u32( p )              (*((uint32x4_t*)(p)))
 #define castp_v128u32( p )             ((uint32x4_t*)(p))

-#define v128_zero                      v128_64( 0ull )
-
-#define v128_cmpeq_zero                vceqzq_u64
-
-#define v128_neg1                      v128_64( 0xffffffffffffffffull )
-
 // set1
 #define v128_64                        vmovq_n_u64
 #define v128_32                        vmovq_n_u32
 #define v128_16                        vmovq_n_u16
 #define v128_8                         vmovq_n_u8

+#define v128_zero                      v128_64( 0ull )
+#define v128_neg1                      v128_64( 0xffffffffffffffffull )
+
 #define v64_set32( u32_1, u32_0 ) \
  vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
                             ((uint16x8_t)(v)), c )

 #define v128_rol16( v, c ) \
-  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
+  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
               : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
                             ((uint16x8_t)(v)), c )

 #define v128_ror8( v, c ) \
-      vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)),  8-(c) ), \
+      vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
                  ((uint8x16_t)(v)), c )

 #define v128_rol8( v, c ) \
-      vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)),  8-(c) ), \
+      vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
                 ((uint8x16_t)(v)), c )

-
-// ( v1 ^ v0 ) >>> n 
+// ( v1 ^ v0 ) >>> c 
 #if defined(__ARM_FEATURE_SHA3)
-
-#define v128_ror64xor( v1, v0, n )  vxarq_u64( v1, v0, n ) 
-
+  #define v128_ror64xor( v1, v0, c )  vxarq_u64( v1, v0, c ) 
 #else
-
-#define v128_ror64xor( v1, v0, n )  v128_ror64( v128_xor( v1, v0 ), n ) 
-
+  #define v128_ror64xor( v1, v0, c )  v128_ror64( v128_xor( v1, v0 ), c ) 
 #endif

 #define v128_2ror64( v1, v0, c ) \
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }

-#define v128_2rorx32( v1, v0, c ) \
+#define v128_2ror32( v1, v0, c ) \
 { \
 uint32x4_t t0 = vshlq_n_u32( v0, c ); \
 uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_lrev16            vrev32q_u16

 // aka bswap
-#define v128_qrev8             vrev64q_u8
-#define v128_lrev8             vrev32q_u8
-#define v128_wrev8             vrev16q_u8
+// #define v128_qrev8             vrev64q_u8
+// #define v128_lrev8             vrev32q_u8
+// #define v128_wrev8             vrev16q_u8

 // full vector rotation

@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 #define v128_bswap16(v)        (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
 #define v128_bswap32(v)        (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
 #define v128_bswap64(v)        (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
-#define v128_bswap128(v)       (uint32x4_t)v128_swap64( v128_bswap64(v) )
+#define v128_bswap128(v)       (uint32x4_t)v128_rev64( v128_bswap64(v) )

-// Usefull for x86_64 but does nothing for ARM
+// Useful for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
 { \
   casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )

 // Bitwise blend using vector mask, use only bytewise for compatibility
 // with x86_64.
-#define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v1, v0 )
+#define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v0, v1 )

 #endif   // __ARM_NEON
 #endif   // SIMD_NEON_H__
--- a/simd-utils/simd-sve.h
+++ b/simd-utils/simd-sve.h
@@ -1,25 +1,152 @@
 // Placeholder for now.
 //
-// This file will hold AArch64 SVE code, a replecement for NEON that uses vector length
-// agnostic instructions. This means the same code can be used on CPUs with different
-// SVE vector register lengths. This is not good for vectorized hashing.
+// This file will hold AArch64 SVE code, a replecement for NEON that uses
+// vector length agnostic instructions. This means the same code can be used
+// on CPUs with different SVE vector register lengths. This is not good for
+// vectorized hashing.
 // Optimum hash is sensitive to the vector register length with different code
-// used for different register sizes. On X86_64 the vector length is tied to the CPU
-// feature making it simple and efficient to handle different lengths although it
-// results in multiple executables. Theoretically SVE could use a single executable for
-// any vector length.
+// used for different register sizes. On X86_64 the vector length is tied to
+// the CPU feature making it simple and efficient to handle different lengths
+// although it results in multiple executables. Theoretically SVE could use a
+// single executable for any vector length.
 //
-// With the SVE vector length only known at run time it resultis in run time overhead
-// to test the vector length. Theoretically it could be tested at program loading and
-// appropriate libraries loaded. However I don't know if this can be done and if so
-// how to do it.
+// With the SVE vector length only known at run time it results in run time
+// overhead to test the vector length. Theoretically it could be tested at
+// program loading and appropriate libraries loaded. However I don't know if
+// this can be done and if specified how to do it.
 //
 // SVE is not expected to be used for 128 bit vectors as it does not provide any
 // advantages over NEON. However, it may be implemented for testing purposes
-// because CPU with registers larger than 128 bits are currently very rare and very
-// expensive server class CPUs.
+// because CPU with registers larger than 128 bits are currently very rare and
+// very expensive server class CPUs.
 //
-// N-way parallel hashing could be the best use of SVE, usimg the same code for all 
-// vector lengths with the only variable being the number of lanes. This will still
-// require run time checking but should be lighter than substituting functions.
+// However, 128 bit vectors also need to be supported with 256 bit registers.
+// This could be a challenge for un-predicated functions.
+//
+// N-way parallel hashing could be the best use of SVE, usimg the same code
+// for all vector lengths with the only variable being the number of lanes.
+// This will still require run time checking but should be lighter than
+// substituting functions.
+
+// Current approach is to hard code the length in these intrinsics and called
+// by existing length specific code.
+// define with sv_ prefix for generic use predicate provided by caller,
+// use sv<size>_ with hard coded predicate.
+// v<size>_ only if and when it's compatible with SSE & NEON
+
+// Many instructions have no predicate operand, how is VVL handled?
+// How does the CPU know how long the vector is and whether it spans
+// multiple registers without the predicate?
+
+// Also how does the predicate define the vector size? How to tell if inactive
+// high lanes are part of the vector or beyond its range.
+//
+// Some intructions may have an implied predicate by other arguments. 
+// TBL for example will only have shuffle indexes for active lanes.
+// However this is dependant on software being aware of register size.
+
+
+ 
+#if 0
+// #if defined USE_SV128
+// NEON needs to be disabled
+
+#define PRED128 0xffff
+#define PRED256 0xffffffff
+
+// Types should be transparent
+
+
+#define sv128u32_t  svuint32_t
+#define sv256u32_t  svuint32_t
+
+
+// load1
+
+
+// arithmetic
+
+// _z zero inactive elements, _x undefined inactive elements, _m inactive
+// elements from first arg. arg order only matters when _m used. Use _x.
+
+#define sv_add32( p, v1, v0 )         svadd_u32_x( p, v1, v0 )
+
+#define sv128_add32( v1, v0 )         svadd_u32_x( PRED128, v1, v0 )
+#define sv256_add32( v1, v0 )         svadd_u32_x( PRED256, v1, v0 )
+
+// Add integer to each element
+#define sv_addi32( p, v, i )           svadd_n_u32_x( p, v, i )
+
+
+
+// compare
+
+#define sv_cmpeq32( p, v1, v0 )       svcmpeq_u32( p, v1, v0 )
+
+#define sv128_cmpeq32( v1, v0 )       svcmpeq_u32( PRED128, v1, v0 )
+#define sv256_cmpeq32( v1, v0 )       svcmpeq_u32( PRED256, v1, v0 )
+
+
+// bit shift
+
+#define sv_sl32( v, c )              svlsl_n_u32_x( p, v, c )
+
+#define sv128_sl32( v, c )           svlsl_n_u32_x( PRED128, v, c )
+#define sv256_sl32( v, c )           svlsl_n_u32_x( PRED256, v, c )
+
+
+// logic
+
+#define sv_or( p, v1, v0 )           svorr_u32_x( p, v1, v0 )
+
+#define sv128_or( v1, v0 )           svorr_u32_x( PRED128, v1, v0 )
+#define sv256_or( v1, v0 )           svorr_u32_x( PRED256, v1, v0 )
+
+// ext used for alignr, and zip used for unpack have no predicate arg.
+// How is vector length determined? How are register sizes handled?
+// How are part registers handled?
+
+// alignr (ext)
+
+// unpack
+
+
+// AES
+
+// AES uses fixed 128 bit vectors, how does this work with larger registers?
+ 
+// set1
+
+#define sv128_32( n )      svdup_n_u32_x( PRED128, n )
+#define sv256_32( n )      svdup_n_u32_x( PRED256, n )
+
+// broadcast
+
+// svdup_lane has no predicate
+
+// constants
+
+
+// pointer cast
+
+
+// Bit rotation
+
+// No predication for shift instructions
+
+// Cross lane shuffles
+
+// Very limited shuffling, mostly svtbl which has no predicate and  uses
+// vector for the index.
+
+
+// endian byte swap
+
+
+#define sv128_bswap32(v)        svrevb_u32_x( p, v )
+
+
+// blend
+
+#enfif