Files
cpuminer-opt-gpu/simd-utils/simd-sve.h
Jay D Dee 06624a0ff2 v24.6
2024-12-08 11:14:08 -05:00

153 lines
4.4 KiB
C

// Placeholder for now.
//
// This file will hold AArch64 SVE code, a replacement for NEON that uses
// vector length agnostic instructions. This means the same code can be used
// on CPUs with different SVE vector register lengths. This is not good for
// vectorized hashing.
// Optimum hash is sensitive to the vector register length with different code
// used for different register sizes. On X86_64 the vector length is tied to
// the CPU feature making it simple and efficient to handle different lengths
// although it results in multiple executables. Theoretically SVE could use a
// single executable for any vector length.
//
// With the SVE vector length only known at run time it results in run time
// overhead to test the vector length. Theoretically it could be tested at
// program loading and appropriate libraries loaded. However I don't know if
// this can be done and if specified how to do it.
//
// SVE is not expected to be used for 128 bit vectors as it does not provide any
// advantages over NEON. However, it may be implemented for testing purposes
// because CPU with registers larger than 128 bits are currently very rare and
// very expensive server class CPUs.
//
// However, 128 bit vectors also need to be supported with 256 bit registers.
// This could be a challenge for un-predicated functions.
//
// N-way parallel hashing could be the best use of SVE, using the same code
// for all vector lengths with the only variable being the number of lanes.
// This will still require run time checking but should be lighter than
// substituting functions.
// The current approach is to hard code the length in these intrinsics, which
// are called by existing length-specific code.
// define with sv_ prefix for generic use predicate provided by caller,
// use sv<size>_ with hard coded predicate.
// v<size>_ only if and when it's compatible with SSE & NEON
// Many instructions have no predicate operand, how is VVL handled?
// How does the CPU know how long the vector is and whether it spans
// multiple registers without the predicate?
// Also how does the predicate define the vector size? How to tell if inactive
// high lanes are part of the vector or beyond its range.
//
// Some instructions may have an implied predicate by other arguments.
// TBL for example will only have shuffle indexes for active lanes.
// However this is dependent on software being aware of register size.
#if 0
// #if defined USE_SV128
// NEON needs to be disabled
// Governing predicates for fixed 128 and 256 bit operation.
// SVE intrinsics take an svbool_t predicate, not an integer bit mask, so the
// predicates are built with svptrue_pat. A byte granular predicate with the
// first 16 (or 32) byte lanes active also governs wider elements covering the
// same bytes, so one predicate serves every element size.
// Note: SV_VL32 produces an all-false predicate when the hardware vector is
// shorter than 256 bits.
#define PRED128 svptrue_pat_b8( SV_VL16 )
#define PRED256 svptrue_pat_b8( SV_VL32 )
// Types should be transparent
// Both width-tagged aliases name the same SVE type, svuint32_t; the 128/256
// in the alias name only records the width the caller intends to use.
#define sv128u32_t   svuint32_t
#define sv256u32_t   svuint32_t
// load1
// arithmetic
// _z zero inactive elements, _x undefined inactive elements, _m inactive
// elements from first arg. arg order only matters when _m used. Use _x.
// 32 bit lane addition using the _x intrinsic form.
// sv_add32 takes the governing predicate from the caller as its first
// argument; the sv128_ and sv256_ forms supply PRED128 / PRED256 themselves.
#define sv_add32( pg, a, b )   svadd_u32_x( (pg), (a), (b) )
#define sv128_add32( a, b )    svadd_u32_x( PRED128, (a), (b) )
#define sv256_add32( a, b )    svadd_u32_x( PRED256, (a), (b) )
// Add integer to each element
// Adds the scalar k to 32 bit lanes of vec under the caller supplied
// predicate pg, using the _x intrinsic form.
#define sv_addi32( pg, vec, k )   svadd_n_u32_x( (pg), (vec), (k) )
// compare
// 32 bit lane equality compare via svcmpeq_u32.
// sv_cmpeq32 takes the governing predicate from the caller; the sv128_ and
// sv256_ forms supply PRED128 / PRED256 themselves.
#define sv_cmpeq32( pg, a, b )   svcmpeq_u32( (pg), (a), (b) )
#define sv128_cmpeq32( a, b )    svcmpeq_u32( PRED128, (a), (b) )
#define sv256_cmpeq32( a, b )    svcmpeq_u32( PRED256, (a), (b) )
// bit shift
// Shift 32 bit lanes of v left by the scalar count c, using the _x intrinsic
// form.
// sv_sl32 takes the governing predicate p as its first argument, like the
// other sv_ prefixed macros. It previously expanded to a use of p without
// declaring it as a parameter, silently capturing whatever p was in scope at
// the call site.
// The sv128_ and sv256_ forms supply PRED128 / PRED256 themselves.
#define sv_sl32( p, v, c ) svlsl_n_u32_x( p, v, c )
#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )
// logic
// Bitwise OR via svorr_u32_x.
// sv_or takes the governing predicate from the caller; the sv128_ and sv256_
// forms supply PRED128 / PRED256 themselves.
#define sv_or( pg, a, b )   svorr_u32_x( (pg), (a), (b) )
#define sv128_or( a, b )    svorr_u32_x( PRED128, (a), (b) )
#define sv256_or( a, b )    svorr_u32_x( PRED256, (a), (b) )
// ext used for alignr, and zip used for unpack have no predicate arg.
// How is vector length determined? How are register sizes handled?
// How are part registers handled?
// alignr (ext)
// unpack
// AES
// AES uses fixed 128 bit vectors, how does this work with larger registers?
// set1
// set1: duplicate the 32 bit scalar val through svdup_n_u32_x, governed by
// PRED128 or PRED256.
#define sv128_32( val )   svdup_n_u32_x( PRED128, (val) )
#define sv256_32( val )   svdup_n_u32_x( PRED256, (val) )
// broadcast
// svdup_lane has no predicate
// constants
// pointer cast
// Bit rotation
// No predication for shift instructions
// Cross lane shuffles
// Very limited shuffling, mostly svtbl which has no predicate and uses
// vector for the index.
// endian byte swap
// Reverse the byte order within each 32 bit lane of v (endian swap).
// Governed by PRED128 like the other sv128_ macros; it previously expanded to
// an undeclared p, which either failed to compile or captured an unrelated
// variable at the call site.
#define sv128_bswap32(v) svrevb_u32_x( PRED128, v )
// blend
#enfif