mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.9.2.5
This commit is contained in:
398
simd-utils/simd-types.h
Normal file
398
simd-utils/simd-types.h
Normal file
@@ -0,0 +1,398 @@
|
||||
//////////////////////////////////////
|
||||
//
|
||||
// Type abstraction overlays designed for use in highly optimized
|
||||
// straight line code operating on array structures. It uses direct
|
||||
// struct member access instead of indexing to access array elements.
|
||||
// Ex: array.u32_3 instead of array[3].
|
||||
//
|
||||
// Vector types are used to represent arrays. 64 and 128 bit vectors have
|
||||
// corresponding 64 and 128 bit integer types.
|
||||
//
|
||||
// Data accesses are not tied to memory as arrays are. These structures
|
||||
// can operate comfortably as register variables.
|
||||
//
|
||||
// Although the abstraction makes for transparent usage there is overhead.
|
||||
// Extra move instructions are required when an operation requires a
|
||||
// different register type. Additionally, 128 bit operations, uint128_t
|
||||
// and AES, can't be done in parallel with a 256 bit or larger vector.
|
||||
// They require additional move instructions in addition to the lack of
|
||||
// improvement from parallelism.
|
||||
//
|
||||
// Move instruction overhead is required when moving among gpr, mmx
|
||||
// and xmm registers. The number of extra moves is usually the number
|
||||
// of elements in the vector. If both are the same size only one move
|
||||
// is required. The number is doubled if the data is moved back.
|
||||
//
|
||||
// xmm and ymm registers are special, they are aliased. xmm registers
|
||||
// overlay the lower 128 bits of the ymm registers. Accessing the data
|
||||
// in the lower half of a ymm register by an xmm argument is free.
|
||||
// The upper 128 bits need to be extracted and inserted like with other
|
||||
// different sized data types.
|
||||
//
|
||||
// Integer types can be converted to differently sized integers without
|
||||
// penalty.
|
||||
//
|
||||
// Conversions with penalty should be avoided as much as possible by grouping
|
||||
// operations requiring the same register set.
|
||||
//
|
||||
// There are two algorithms for extracting and inserting data.
|
||||
//
|
||||
// There is the straightforward iterative method where each element is
|
||||
// extracted or inserted in turn. The compiler evidently takes a different
|
||||
// approach based on assembly code generated by a set intrinsic.
|
||||
// To extract 64 bit or smaller elements from a 256 bit vector the compiler
|
||||
// first extracts the upper 128 bit into a second xmm register. This
|
||||
// eliminates a dependency between the upper and lower elements allowing
|
||||
// the CPU more opportunity at multiple operations per clock.
|
||||
// This adds one additional instruction to the process. With AVX512
// another stage is added by first splitting up the 512 bit vector into
|
||||
// two 256 bit vectors.
|
||||
//
|
||||
// xmm/ymm aliasing makes accessing low half trivial and without cost.
|
||||
// Accessing the upper half requires a move from the upper half of
|
||||
// the source register to the lower half of the destination.
|
||||
// It's a bigger issue with GPRs as there is no aliasing.
|
||||
//
|
||||
// Theoretically memory resident data could bypass the move and load
|
||||
// the data directly into the desired register type. However this
|
||||
// ignores the overhead to ensure coherency between register and memory
|
||||
// which is significantly more.
|
||||
//
|
||||
// Overlay avoids pointer dereferences and favours register move over
|
||||
// memory load, notwithstanding compiler optimization.
|
||||
//
|
||||
// The syntax is ugly but can be abstracted with macros.
|
||||
|
||||
|
||||
// Universal 64 bit overlay
|
||||
// Avoids arrays and pointers, suitable as register variable.
|
||||
// Conversions are transparent but not free, cost is one MOV instruction.
|
||||
// Facilitates manipulating 32 bit data in 64 bit pairs.
|
||||
// Allows full use of 64 bit registers for 32 bit data, effectively doubling
|
||||
// the size of the register set.
|
||||
// Potentially up to 50% reduction in instructions depending on rate of
|
||||
// conversion.
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
//
|
||||
// 128 bit integer
|
||||
//
|
||||
// Native type __int128 supported starting with GCC-4.8.
|
||||
//
|
||||
// __int128 uses two 64 bit GPRs to hold the data. The main benefits are
|
||||
// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
|
||||
// is not required. int128 also works better with other integer sizes.
|
||||
// Vectors benefit from wider registers.
|
||||
//
|
||||
// For safety use typecasting on all numeric arguments.
|
||||
//
|
||||
// Use typecasting for conversion to/from 128 bit vector:
|
||||
// __m128i v128 = (__m128i)my_int128l
|
||||
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
|
||||
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
|
||||
|
||||
// Compiler check for __int128 support
|
||||
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
||||
#define GCC_INT128 1
|
||||
#endif
|
||||
|
||||
#if !defined(GCC_INT128)
|
||||
#warning "__int128 not supported, requires GCC-4.8 or newer."
|
||||
#endif
|
||||
|
||||
#if defined(GCC_INT128)
|
||||
|
||||
// Familiar looking type names
|
||||
typedef __int128 int128_t;
|
||||
typedef unsigned __int128 uint128_t;
|
||||
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////
|
||||
//
|
||||
// MMX 64 bit vector
|
||||
//
|
||||
|
||||
|
||||
// Emulates uint32_t[2]
|
||||
struct _regarray_u32x2
|
||||
{
|
||||
uint32_t _0; uint32_t _1;
|
||||
};
|
||||
typedef struct _regarray_u32x2 regarray_u32x2;
|
||||
|
||||
// Emulates uint16_t[4]
|
||||
struct _regarray_u16x4
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
};
|
||||
typedef struct _regarray_u16x4 regarray_u16x4;
|
||||
|
||||
// Emulates uint8_t[8]
|
||||
struct _regarray_u8x8
|
||||
{
|
||||
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
|
||||
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
|
||||
};
|
||||
typedef struct _regarray_u8x8 regarray_u8x8;
|
||||
|
||||
// universal 64 bit overlay
|
||||
union _regarray_64
|
||||
{
|
||||
regarray_u32x2 u32_; // uint32_t[2]
|
||||
regarray_u16x4 u16_; // uint16_t[4]
|
||||
regarray_u8x8 u8_; // uint8_t[8]
|
||||
uint64_t u64;
|
||||
__m64 v64;
|
||||
};
|
||||
typedef union _regarray_64 regarray_64;
|
||||
|
||||
/////
|
||||
//
|
||||
// SSE2
|
||||
|
||||
// Universal 128 bit overlay
|
||||
//
|
||||
// Avoids arrays and pointers, suitable as register variable.
|
||||
// Designed for speed in straight line code with no loops.
|
||||
//
|
||||
// Conversions are transparent but not free, cost is one MOV instruction
|
||||
// in each direction, except for lower half of ymm to/from xmm which are
|
||||
// free.
|
||||
//
|
||||
// Facilitates two dimensional vectoring.
|
||||
//
|
||||
// 128 bit integer and AES can't be done in parallel. AES suffers extraction
|
||||
// and insertion of the upper 128 bits. uint128_t suffers 4 times the cost
|
||||
// with 2 64 bit extractions and 2 insertions for each 128 bit lane with
|
||||
// single stage ymm <--> gpr for a total of 8 moves.
|
||||
//
|
||||
// Two stage conversion is possible which helps CPU instruction scheduling
|
||||
// by removing a register dependency between the upper and lower 128 at the
|
||||
// cost of two extra instructions (128 bit extract and insert). The compiler
|
||||
// seems to prefer the 2 staged approach when using the set intrinsic.
|
||||
|
||||
// Use macros to simplify array access emulation.
|
||||
// emulated array type: uint64_t a[4];
|
||||
// array indexing: a[0], a[1]
|
||||
// overlay emulation: a.u64_0, a.u64_1
|
||||
// without macro: a.u64_._0, a.u64_._1
|
||||
|
||||
|
||||
|
||||
struct _regarray_u64x2
|
||||
{
|
||||
uint64_t _0; uint64_t _1;
|
||||
};
|
||||
typedef struct _regarray_u64x2 regarray_u64x2;
|
||||
|
||||
struct _regarray_v64x2
|
||||
{
|
||||
__m64 _0; __m64 _1;
|
||||
};
|
||||
typedef struct _regarray_v64x2 regarray_v64x2;
|
||||
|
||||
struct _regarray_u32x4
|
||||
{
|
||||
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
|
||||
};
|
||||
typedef struct _regarray_u32x2 regarray_u32x4;
|
||||
|
||||
struct _regarray_u16x8
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
|
||||
};
|
||||
typedef struct _regarray_u16x4 regarray_u16x4;
|
||||
|
||||
struct _regarray_u8x16
|
||||
{
|
||||
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
|
||||
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
|
||||
uint8_t _8; uint8_t _9; uint8_t _a; uint8_t _b;
|
||||
uint8_t _c; uint8_t _d; uint8_t _e; uint8_t _f;
|
||||
};
|
||||
typedef struct _regarray_u8x16 regarray_u8x16;
|
||||
|
||||
|
||||
union _register_array_m128v
|
||||
{
|
||||
#if defined(GCC_INT128)
|
||||
uint128_t u128;
|
||||
#endif
|
||||
__m128i v128;
|
||||
regarray_u64x2 u64_; // uint64_t[2]
|
||||
regarray_v64x2 v64_; // __m64[2]
|
||||
regarray_u32x4 u32_; // uint32_t[4]
|
||||
regarray_u16x4 u16_; // uint16_t[8]
|
||||
regarray_u8x16 u8_; // uint8_t[16]
|
||||
};
|
||||
typedef union _register_array_m128v register_array_m128v;
|
||||
|
||||
///////////////////
|
||||
//
|
||||
// AVX2
|
||||
//
|
||||
|
||||
|
||||
struct _regarray_v128x2
|
||||
{
|
||||
__m128i _0; __m128i _1;
|
||||
};
|
||||
typedef struct _regarray_v128x2 regarray_v128x2;
|
||||
|
||||
struct _regarray_u128x2
|
||||
{
|
||||
uint128_t _0; uint128_t _1;
|
||||
};
|
||||
typedef struct _regarray_u128x2 regarray_u128x2;
|
||||
|
||||
struct _regarray_u64x4
|
||||
{
|
||||
uint64_t _0; uint64_t _1; uint64_t _2; uint64_t _3;
|
||||
};
|
||||
typedef struct _regarray_u64x4 regarray_u64x4;
|
||||
|
||||
struct _regarray_v64x4
|
||||
{
|
||||
__m64 _0; __m64 _1; __m64 _2; __m64 _3;
|
||||
};
|
||||
typedef struct _regarray_v64x4 regarray_v64x4;
|
||||
|
||||
struct _regarray_u32x8
|
||||
{
|
||||
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
|
||||
uint32_t _4; uint32_t _5; uint32_t _6; uint32_t _7;
|
||||
};
|
||||
typedef struct _regarray_u32x8 regarray_u32x8;
|
||||
|
||||
struct _regarray_u16x16
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
|
||||
uint16_t _8; uint16_t _9; uint16_t _a; uint16_t _b;
|
||||
uint16_t _c; uint16_t _d; uint16_t _e; uint16_t _f;
|
||||
};
|
||||
typedef struct _regarray_u16x16 regarray_u16x16;
|
||||
|
||||
struct _regarray_u8x32
|
||||
{
|
||||
uint8_t _00; uint8_t _01; uint8_t _02; uint8_t _03;
|
||||
uint8_t _04; uint8_t _05; uint8_t _06; uint8_t _07;
|
||||
uint8_t _08; uint8_t _09; uint8_t _0a; uint8_t _0b;
|
||||
uint8_t _0c; uint8_t _0d; uint8_t _0e; uint8_t _0f;
|
||||
uint8_t _10; uint8_t _11; uint8_t _12; uint8_t _13;
|
||||
uint8_t _14; uint8_t _15; uint8_t _16; uint8_t _17;
|
||||
uint8_t _18; uint8_t _19; uint8_t _1a; uint8_t _1b;
|
||||
uint8_t _1c; uint8_t _1d; uint8_t _1e; uint8_t _1f;
|
||||
};
|
||||
typedef struct _regarray_u8x32 regarray_u8x32;
|
||||
|
||||
union _regarray_v256
|
||||
{
|
||||
__m256i v256;
|
||||
#if defined(GCC_INT128)
|
||||
regarray_u128x2 u128_; // uint128_t[2]
|
||||
#endif
|
||||
regarray_v128x2 v128_; // __m128i[2]
|
||||
regarray_v64x4 v64_;
|
||||
regarray_u64x4 u64_;
|
||||
regarray_u32x8 u32_;
|
||||
regarray_u16x16 u16_;
|
||||
regarray_u8x32 u8_;
|
||||
};
|
||||
typedef union _regarray_v256 regarray_v256;
|
||||
|
||||
////////////
|
||||
//
|
||||
// Abstraction macros to allow easy readability.
|
||||
// Users may define their own list to suit their preferences
|
||||
// such as, upper case hex, leading zeros, multidimensional,
|
||||
// alphabetic, day of week, etc..
|
||||
|
||||
#define v128_0 v128_._0
|
||||
#define v128_1 v128_._1
|
||||
|
||||
#define u128_0 u128_._0
|
||||
#define u128_1 u128_._1
|
||||
|
||||
#define v64_0 v64_._0
|
||||
#define v64_1 v64_._1
|
||||
#define v64_2 v64_._2
|
||||
#define v64_3 v64_._3
|
||||
|
||||
#define u64_0 u64_._0
|
||||
#define u64_1 u64_._1
|
||||
#define u64_2 u64_._2
|
||||
#define u64_3 u64_._3
|
||||
|
||||
#define u32_0 u32_._0
|
||||
#define u32_1 u32_._1
|
||||
#define u32_2 u32_._2
|
||||
#define u32_3 u32_._3
|
||||
#define u32_4 u32_._4
|
||||
#define u32_5 u32_._5
|
||||
#define u32_6 u32_._6
|
||||
#define u32_7 u32_._7
|
||||
|
||||
#define u16_0 u16_._0
|
||||
#define u16_1 u16_._1
|
||||
#define u16_2 u16_._2
|
||||
#define u16_3 u16_._3
|
||||
#define u16_4 u16_._4
|
||||
#define u16_5 u16_._5
|
||||
#define u16_6 u16_._6
|
||||
#define u16_7 u16_._7
|
||||
#define u16_8 u16_._8
|
||||
#define u16_9 u16_._9
|
||||
#define u16_a u16_._a
|
||||
#define u16_b u16_._b
|
||||
#define u16_c u16_._c
|
||||
#define u16_d u16_._d
|
||||
#define u16_e u16_._e
|
||||
#define u16_f u16_._f
|
||||
|
||||
#define u8_00 u8_._00
|
||||
#define u8_01 u8_._01
|
||||
#define u8_02 u8_._02
|
||||
#define u8_03 u8_._03
|
||||
#define u8_04 u8_._04
|
||||
#define u8_05 u8_._05
|
||||
#define u8_06 u8_._06
|
||||
#define u8_07 u8_._07
|
||||
#define u8_08 u8_._08
|
||||
#define u8_09 u8_._09
|
||||
#define u8_0a u8_._0a
|
||||
#define u8_0b u8_._0b
|
||||
#define u8_0c u8_._0c
|
||||
#define u8_0d u8_._0d
|
||||
#define u8_0e u8_._0e
|
||||
#define u8_0f u8_._0f
|
||||
#define u8_10 u8_._10
|
||||
#define u8_11 u8_._11
|
||||
#define u8_12 u8_._12
|
||||
#define u8_13 u8_._13
|
||||
#define u8_14 u8_._14
|
||||
#define u8_15 u8_._15
|
||||
#define u8_16 u8_._16
|
||||
#define u8_17 u8_._17
|
||||
#define u8_18 u8_._18
|
||||
#define u8_19 u8_._19
|
||||
#define u8_1a u8_._1a
|
||||
#define u8_1b u8_._1b
|
||||
#define u8_1c u8_._1c
|
||||
#define u8_1d u8_._1d
|
||||
#define u8_1e u8_._1e
|
||||
#define u8_1f u8_._1f
|
||||
|
||||
|
||||
// This is in use by, coincidentally, simd hash.
|
||||
union _m256_v16 {
|
||||
uint16_t u16[16];
|
||||
__m256i v256;
|
||||
};
|
||||
typedef union _m256_v16 m256_v16;
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user