v24.6

2026-02-22 16:33:08 +00:00 · 2024-12-08 11:14:08 -05:00
parent 8e91bfbe19
commit 06624a0ff2
18 changed files with 1526 additions and 1140 deletions
--- a/simd-utils/simd-sve.h
+++ b/simd-utils/simd-sve.h
@@ -1,25 +1,152 @@
 // Placeholder for now.
 //
-// This file will hold AArch64 SVE code, a replecement for NEON that uses vector length
-// agnostic instructions. This means the same code can be used on CPUs with different
-// SVE vector register lengths. This is not good for vectorized hashing.
+// This file will hold AArch64 SVE code, a replacement for NEON that uses
+// vector length agnostic instructions. This means the same code can be used
+// on CPUs with different SVE vector register lengths. This is not good for
+// vectorized hashing.
 // Optimum hash is sensitive to the vector register length with different code
-// used for different register sizes. On X86_64 the vector length is tied to the CPU
-// feature making it simple and efficient to handle different lengths although it
-// results in multiple executables. Theoretically SVE could use a single executable for
-// any vector length.
+// used for different register sizes. On X86_64 the vector length is tied to
+// the CPU feature making it simple and efficient to handle different lengths
+// although it results in multiple executables. Theoretically SVE could use a
+// single executable for any vector length.
 //
-// With the SVE vector length only known at run time it resultis in run time overhead
-// to test the vector length. Theoretically it could be tested at program loading and
-// appropriate libraries loaded. However I don't know if this can be done and if so
-// how to do it.
+// With the SVE vector length only known at run time it results in run time
+// overhead to test the vector length. Theoretically it could be tested at
+// program loading and appropriate libraries loaded. However I don't know if
+// this can be done and if so how to do it.
 //
 // SVE is not expected to be used for 128 bit vectors as it does not provide any
 // advantages over NEON. However, it may be implemented for testing purposes
-// because CPU with registers larger than 128 bits are currently very rare and very
-// expensive server class CPUs.
+// because CPUs with registers larger than 128 bits are currently very rare and
+// very expensive server class CPUs.
 //
-// N-way parallel hashing could be the best use of SVE, usimg the same code for all 
-// vector lengths with the only variable being the number of lanes. This will still
-// require run time checking but should be lighter than substituting functions.
+// However, 128 bit vectors also need to be supported with 256 bit registers.
+// This could be a challenge for un-predicated functions.
+//
+// N-way parallel hashing could be the best use of SVE, using the same code
+// for all vector lengths with the only variable being the number of lanes.
+// This will still require run time checking but should be lighter than
+// substituting functions.
+
+// The current approach is to hard code the length in these intrinsics, which
+// are called by existing length specific code.
+// Define with the sv_ prefix for generic use with a predicate provided by the
+// caller, and with the sv<size>_ prefix for a hard coded predicate.
+// Use v<size>_ only if and when it's compatible with SSE & NEON.
+
+// Many instructions have no predicate operand, how is VVL handled?
+// How does the CPU know how long the vector is and whether it spans
+// multiple registers without the predicate?
+
+// Also how does the predicate define the vector size? How to tell if inactive
+// high lanes are part of the vector or beyond its range.
+//
+// Some instructions may have an implied predicate by other arguments.
+// TBL for example will only have shuffle indexes for active lanes.
+// However this is dependent on software being aware of register size.
+
+
+ 
+#if 0
+// #if defined USE_SV128
+// NEON needs to be disabled
+
+#define PRED128 svptrue_pat_b8( SV_VL16 )   // svbool_t, low 16 byte lanes
+#define PRED256 svptrue_pat_b8( SV_VL32 )   // svbool_t, low 32 byte lanes
+
+// Types should be transparent: both names below alias svuint32_t.
+
+
+#define sv128u32_t  svuint32_t
+#define sv256u32_t  svuint32_t
+
+
+// load1
+
+
+// arithmetic
+
+// Suffixes: _z zeroes inactive elements, _x leaves them undefined, _m takes
+// them from the first arg. Arg order only matters when _m is used. Use _x.
+
+#define sv_add32( p, v1, v0 )         svadd_u32_x( p, v1, v0 )
+
+#define sv128_add32( v1, v0 )         svadd_u32_x( PRED128, v1, v0 )
+#define sv256_add32( v1, v0 )         svadd_u32_x( PRED256, v1, v0 )
+
+// Add the integer i to each element of v, predicate p supplied by caller
+#define sv_addi32( p, v, i )           svadd_n_u32_x( p, v, i )
+
+
+
+// compare 32 bit elements for equality
+
+#define sv_cmpeq32( p, v1, v0 )       svcmpeq_u32( p, v1, v0 )
+
+#define sv128_cmpeq32( v1, v0 )       svcmpeq_u32( PRED128, v1, v0 )
+#define sv256_cmpeq32( v1, v0 )       svcmpeq_u32( PRED256, v1, v0 )
+
+
+// bit shift left of 32 bit elements, c is a scalar shift count
+
+#define sv_sl32( p, v, c )           svlsl_n_u32_x( p, v, c )
+
+#define sv128_sl32( v, c )           svlsl_n_u32_x( PRED128, v, c )
+#define sv256_sl32( v, c )           svlsl_n_u32_x( PRED256, v, c )
+
+
+// logic, OR of v1 and v0 using the 32 bit element form
+
+#define sv_or( p, v1, v0 )           svorr_u32_x( p, v1, v0 )
+
+#define sv128_or( v1, v0 )           svorr_u32_x( PRED128, v1, v0 )
+#define sv256_or( v1, v0 )           svorr_u32_x( PRED256, v1, v0 )
+
+// ext used for alignr, and zip used for unpack have no predicate arg.
+// How is vector length determined? How are register sizes handled?
+// How are part registers handled?
+
+// alignr (ext)
+
+// unpack
+
+
+// AES
+
+// AES uses fixed 128 bit vectors, how does this work with larger registers?
+ 
+// set1, duplicate the 32 bit scalar n using a hard coded predicate
+
+#define sv128_32( n )      svdup_n_u32_x( PRED128, n )
+#define sv256_32( n )      svdup_n_u32_x( PRED256, n )
+
+// broadcast
+
+// svdup_lane has no predicate
+
+// constants
+
+
+// pointer cast
+
+
+// Bit rotation
+
+// No predication for shift instructions
+
+// Cross lane shuffles
+
+// Very limited shuffling, mostly svtbl which has no predicate and  uses
+// vector for the index.
+
+
+// endian byte swap within each 32 bit element, hard coded 128 bit predicate
+
+
+#define sv128_bswap32(v)        svrevb_u32_x( PRED128, v )
+
+
+// blend
+
+#endif