mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.9.2.5
This commit is contained in:
398
simd-utils/simd-types.h
Normal file
398
simd-utils/simd-types.h
Normal file
@@ -0,0 +1,398 @@
|
||||
//////////////////////////////////////
|
||||
//
|
||||
// Type abstraction overlays designed for use in highly optimized
|
||||
// straight line code operating on array structures. It uses direct
|
||||
// struct member access instead of indexing to access array elements.
|
||||
// Ex: array.u32_3 instead of array[3].
|
||||
//
|
||||
// Vector types are used to represent arrays. 64 and 128 bit vectors have
|
||||
// corresponding 64 and 128 bit integer types.
|
||||
//
|
||||
// Data accesses are not tied to memory as arrays are. These structures
|
||||
// can operate comfortably as register variables.
|
||||
//
|
||||
// Although the abstraction makes for transparent usage there is overhead.
|
||||
// Extra move instructions are required when an operation requires a
|
||||
// different register type. Additionally, 128 bit operations, uint128_t
|
||||
// and AES, can't be done in parallel with a 256 bit or larger vector.
|
||||
// They require additional move instructions in addition to the lack of
|
||||
// improvement from parallelism.
|
||||
//
|
||||
// Move instruction overhead is required when moving among gpr, mmx
|
||||
// and xmm registers. The number of extra moves is usually the number
|
||||
// of elements in the vector. If both are the same size only one move
|
||||
// is required. The number is doubled if the data is moved back.
|
||||
//
|
||||
// xmm and ymm registers are special, they are aliased. xmm registers
|
||||
// overlay the lower 128 bits of the ymm registers. Accessing the data
|
||||
// in the lower half of a ymm register by an xmm argument is free.
|
||||
// The upper 128 bits need to be extracted and inserted like with other
|
||||
// different sized data types.
|
||||
//
|
||||
// Integer types can be converted to differently sized integers without
|
||||
// penalty.
|
||||
//
|
||||
// Conversions with penalty should be avoided as much as possible by grouping
|
||||
// operations requiring the same register set.
|
||||
//
|
||||
// There are two algorithms for extracting and inserting data.
|
||||
//
|
||||
// There is the straightforward iterative method where each element is
|
||||
// extracted or inserted in turn. The compiler evidently takes a different
|
||||
// approach based on assembly code generated by a set intrinsic.
|
||||
// To extract 64 bit or smaller elements from a 256 bit vector the compiler
|
||||
// first extracts the upper 128 bit into a second xmm register. This
|
||||
// eliminates a dependency between the upper and lower elements allowing
|
||||
// the CPU more opportunity at multiple operations per clock.
|
||||
// This adds one additional instruction to the process. With AVX512
// another stage is added by first splitting up the 512 bit vector into
|
||||
// two 256 bit vectors.
|
||||
//
|
||||
// xmm/ymm aliasing makes accessing low half trivial and without cost.
|
||||
// Accessing the upper half requires a move from the upper half of
|
||||
// the source register to the lower half of the destination.
|
||||
// It's a bigger issue with GPRs as there is no aliasing.
|
||||
//
|
||||
// Theoretically memory resident data could bypass the move and load
|
||||
// the data directly into the desired register type. However this
|
||||
// ignores the overhead to ensure coherency between register and memory
|
||||
// which is significantly more.
|
||||
//
|
||||
// Overlay avoids pointer dereferences and favours register move over
|
||||
// memory load, notwithstanding compiler optimization.
|
||||
//
|
||||
// The syntax is ugly but can be abstracted with macros.
|
||||
|
||||
|
||||
// Universal 64 bit overlay
|
||||
// Avoids arrays and pointers, suitable as register variable.
|
||||
// Conversions are transparent but not free, cost is one MOV instruction.
|
||||
// Facilitates manipulating 32 bit data in 64 bit pairs.
|
||||
// Allows full use of 64 bit registers for 32 bit data, effectively doubling
|
||||
// the size of the register set.
|
||||
// Potentially up to 50% reduction in instructions depending on rate of
|
||||
// conversion.
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
//
|
||||
// 128 bit integer
|
||||
//
|
||||
// Native type __int128 supported starting with GCC-4.8.
|
||||
//
|
||||
// __int128 uses two 64 bit GPRs to hold the data. The main benefits are
|
||||
// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
|
||||
// is not required. int128 also works better with other integer sizes.
|
||||
// Vectors benefit from wider registers.
|
||||
//
|
||||
// For safety use typecasting on all numeric arguments.
|
||||
//
|
||||
// Use typecasting for conversion to/from 128 bit vector:
|
||||
// __m128i v128 = (__m128i)my_int128l
|
||||
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
|
||||
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
|
||||
|
||||
// Compiler check for __int128 support
|
||||
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
||||
#define GCC_INT128 1
|
||||
#endif
|
||||
|
||||
#if !defined(GCC_INT128)
|
||||
#warning "__int128 not supported, requires GCC-4.8 or newer."
|
||||
#endif
|
||||
|
||||
#if defined(GCC_INT128)
|
||||
|
||||
// Familiar looking type names
|
||||
typedef __int128 int128_t;
|
||||
typedef unsigned __int128 uint128_t;
|
||||
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////
|
||||
//
|
||||
// MMX 64 bit vector
|
||||
//
|
||||
|
||||
|
||||
// Emulates uint32_t[2]
|
||||
struct _regarray_u32x2
|
||||
{
|
||||
uint32_t _0; uint32_t _1;
|
||||
};
|
||||
typedef struct _regarray_u32x2 regarray_u32x2;
|
||||
|
||||
// Emulates uint16_t[4]
|
||||
struct _regarray_u16x4
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
};
|
||||
typedef struct _regarray_u16x4 regarray_u16x4;
|
||||
|
||||
// Emulates uint8_t[8]
|
||||
struct _regarray_u8x8
|
||||
{
|
||||
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
|
||||
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
|
||||
};
|
||||
typedef struct _regarray_u8x8 regarray_u8x8;
|
||||
|
||||
// universal 64 bit overlay
|
||||
union _regarray_64
|
||||
{
|
||||
regarray_u32x2 u32_; // uint32_t[2]
|
||||
regarray_u16x4 u16_; // uint16_t[4]
|
||||
regarray_u8x8 u8_; // uint8_t[8]
|
||||
uint64_t u64;
|
||||
__m64 v64;
|
||||
};
|
||||
typedef union _regarray_64 regarray_64;
|
||||
|
||||
/////
|
||||
//
|
||||
// SSE2
|
||||
|
||||
// Universal 128 bit overlay
|
||||
//
|
||||
// Avoids arrays and pointers, suitable as register variable.
|
||||
// Designed for speed in straight line code with no loops.
|
||||
//
|
||||
// Conversions are transparent but not free, cost is one MOV instruction
|
||||
// in each direction, except for lower half of ymm to/from xmm which are
|
||||
// free.
|
||||
//
|
||||
// Facilitates two dimensional vectoring.
|
||||
//
|
||||
// 128 bit integer and AES can't be done in parallel. AES suffers extraction
|
||||
// and insertion of the upper 128 bits. uint128_t suffers 4 times the cost
|
||||
// with 2 64 bit extractions and 2 insertions for each 128 bit lane with
|
||||
// single stage ymm <--> gpr for a total of 8 moves.
|
||||
//
|
||||
// Two stage conversion is possible which helps CPU instruction scheduling
|
||||
// by removing a register dependency between the upper and lower 128 at the
|
||||
// cost of two extra instructions (128 bit extract and insert). The compiler
|
||||
// seems to prefer the 2 staged approach when using the set intrinsic.
|
||||
|
||||
// Use macros to simplify array access emulation.
|
||||
// emulated array type: uint64_t a[4];
|
||||
// array indexing: a[0], a[1]
|
||||
// overlay emulation: a.u64_0, a.u64_1
|
||||
// without macro: a.u64_._0, a.u64_._1
|
||||
|
||||
|
||||
|
||||
struct _regarray_u64x2
|
||||
{
|
||||
uint64_t _0; uint64_t _1;
|
||||
};
|
||||
typedef struct _regarray_u64x2 regarray_u64x2;
|
||||
|
||||
struct _regarray_v64x2
|
||||
{
|
||||
__m64 _0; __m64 _1;
|
||||
};
|
||||
typedef struct _regarray_v64x2 regarray_v64x2;
|
||||
|
||||
struct _regarray_u32x4
|
||||
{
|
||||
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
|
||||
};
|
||||
typedef struct _regarray_u32x2 regarray_u32x4;
|
||||
|
||||
struct _regarray_u16x8
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
|
||||
};
|
||||
typedef struct _regarray_u16x4 regarray_u16x4;
|
||||
|
||||
struct _regarray_u8x16
|
||||
{
|
||||
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
|
||||
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
|
||||
uint8_t _8; uint8_t _9; uint8_t _a; uint8_t _b;
|
||||
uint8_t _c; uint8_t _d; uint8_t _e; uint8_t _f;
|
||||
};
|
||||
typedef struct _regarray_u8x16 regarray_u8x16;
|
||||
|
||||
|
||||
union _register_array_m128v
|
||||
{
|
||||
#if defined(GCC_INT128)
|
||||
uint128_t u128;
|
||||
#endif
|
||||
__m128i v128;
|
||||
regarray_u64x2 u64_; // uint64_t[2]
|
||||
regarray_v64x2 v64_; // __m64[2]
|
||||
regarray_u32x4 u32_; // uint32_t[4]
|
||||
regarray_u16x4 u16_; // uint16_t[8]
|
||||
regarray_u8x16 u8_; // uint8_t[16]
|
||||
};
|
||||
typedef union _register_array_m128v register_array_m128v;
|
||||
|
||||
///////////////////
|
||||
//
|
||||
// AVX2
|
||||
//
|
||||
|
||||
|
||||
struct _regarray_v128x2
|
||||
{
|
||||
__m128i _0; __m128i _1;
|
||||
};
|
||||
typedef struct _regarray_v128x2 regarray_v128x2;
|
||||
|
||||
struct _regarray_u128x2
|
||||
{
|
||||
uint128_t _0; uint128_t _1;
|
||||
};
|
||||
typedef struct _regarray_u128x2 regarray_u128x2;
|
||||
|
||||
struct _regarray_u64x4
|
||||
{
|
||||
uint64_t _0; uint64_t _1; uint64_t _2; uint64_t _3;
|
||||
};
|
||||
typedef struct _regarray_u64x4 regarray_u64x4;
|
||||
|
||||
struct _regarray_v64x4
|
||||
{
|
||||
__m64 _0; __m64 _1; __m64 _2; __m64 _3;
|
||||
};
|
||||
typedef struct _regarray_v64x4 regarray_v64x4;
|
||||
|
||||
struct _regarray_u32x8
|
||||
{
|
||||
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
|
||||
uint32_t _4; uint32_t _5; uint32_t _6; uint32_t _7;
|
||||
};
|
||||
typedef struct _regarray_u32x8 regarray_u32x8;
|
||||
|
||||
struct _regarray_u16x16
|
||||
{
|
||||
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
|
||||
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
|
||||
uint16_t _8; uint16_t _9; uint16_t _a; uint16_t _b;
|
||||
uint16_t _c; uint16_t _d; uint16_t _e; uint16_t _f;
|
||||
};
|
||||
typedef struct _regarray_u16x16 regarray_u16x16;
|
||||
|
||||
struct _regarray_u8x32
|
||||
{
|
||||
uint8_t _00; uint8_t _01; uint8_t _02; uint8_t _03;
|
||||
uint8_t _04; uint8_t _05; uint8_t _06; uint8_t _07;
|
||||
uint8_t _08; uint8_t _09; uint8_t _0a; uint8_t _0b;
|
||||
uint8_t _0c; uint8_t _0d; uint8_t _0e; uint8_t _0f;
|
||||
uint8_t _10; uint8_t _11; uint8_t _12; uint8_t _13;
|
||||
uint8_t _14; uint8_t _15; uint8_t _16; uint8_t _17;
|
||||
uint8_t _18; uint8_t _19; uint8_t _1a; uint8_t _1b;
|
||||
uint8_t _1c; uint8_t _1d; uint8_t _1e; uint8_t _1f;
|
||||
};
|
||||
typedef struct _regarray_u8x32 regarray_u8x32;
|
||||
|
||||
union _regarray_v256
|
||||
{
|
||||
__m256i v256;
|
||||
#if defined(GCC_INT128)
|
||||
regarray_u128x2 u128_; // uint128_t[2]
|
||||
#endif
|
||||
regarray_v128x2 v128_; // __m128i[2]
|
||||
regarray_v64x4 v64_;
|
||||
regarray_u64x4 u64_;
|
||||
regarray_u32x8 u32_;
|
||||
regarray_u16x16 u16_;
|
||||
regarray_u8x32 u8_;
|
||||
};
|
||||
typedef union _regarray_v256 regarray_v256;
|
||||
|
||||
////////////
|
||||
//
|
||||
// Abstraction macros to allow easy readability.
|
||||
// Users may define their own list to suit their preferences
|
||||
// such as, upper case hex, leading zeros, multidimensional,
|
||||
// alphabetic, day of week, etc..
|
||||
|
||||
#define v128_0 v128_._0
|
||||
#define v128_1 v128_._1
|
||||
|
||||
#define u128_0 u128_._0
|
||||
#define u128_1 u128_._1
|
||||
|
||||
#define v64_0 v64_._0
|
||||
#define v64_1 v64_._1
|
||||
#define v64_2 v64_._2
|
||||
#define v64_3 v64_._3
|
||||
|
||||
#define u64_0 u64_._0
|
||||
#define u64_1 u64_._1
|
||||
#define u64_2 u64_._2
|
||||
#define u64_3 u64_._3
|
||||
|
||||
#define u32_0 u32_._0
|
||||
#define u32_1 u32_._1
|
||||
#define u32_2 u32_._2
|
||||
#define u32_3 u32_._3
|
||||
#define u32_4 u32_._4
|
||||
#define u32_5 u32_._5
|
||||
#define u32_6 u32_._6
|
||||
#define u32_7 u32_._7
|
||||
|
||||
#define u16_0 u16_._0
|
||||
#define u16_1 u16_._1
|
||||
#define u16_2 u16_._2
|
||||
#define u16_3 u16_._3
|
||||
#define u16_4 u16_._4
|
||||
#define u16_5 u16_._5
|
||||
#define u16_6 u16_._6
|
||||
#define u16_7 u16_._7
|
||||
#define u16_8 u16_._8
|
||||
#define u16_9 u16_._9
|
||||
#define u16_a u16_._a
|
||||
#define u16_b u16_._b
|
||||
#define u16_c u16_._c
|
||||
#define u16_d u16_._d
|
||||
#define u16_e u16_._e
|
||||
#define u16_f u16_._f
|
||||
|
||||
#define u8_00 u8_._00
|
||||
#define u8_01 u8_._01
|
||||
#define u8_02 u8_._02
|
||||
#define u8_03 u8_._03
|
||||
#define u8_04 u8_._04
|
||||
#define u8_05 u8_._05
|
||||
#define u8_06 u8_._06
|
||||
#define u8_07 u8_._07
|
||||
#define u8_08 u8_._08
|
||||
#define u8_09 u8_._09
|
||||
#define u8_0a u8_._0a
|
||||
#define u8_0b u8_._0b
|
||||
#define u8_0c u8_._0c
|
||||
#define u8_0d u8_._0d
|
||||
#define u8_0e u8_._0e
|
||||
#define u8_0f u8_._0f
|
||||
#define u8_10 u8_._10
|
||||
#define u8_11 u8_._11
|
||||
#define u8_12 u8_._12
|
||||
#define u8_13 u8_._13
|
||||
#define u8_14 u8_._14
|
||||
#define u8_15 u8_._15
|
||||
#define u8_16 u8_._16
|
||||
#define u8_17 u8_._17
|
||||
#define u8_18 u8_._18
|
||||
#define u8_19 u8_._19
|
||||
#define u8_1a u8_._1a
|
||||
#define u8_1b u8_._1b
|
||||
#define u8_1c u8_._1c
|
||||
#define u8_1d u8_._1d
|
||||
#define u8_1e u8_._1e
|
||||
#define u8_1f u8_._1f
|
||||
|
||||
|
||||
// This is in use by, coincidentally, simd hash.
|
||||
union _m256_v16 {
|
||||
uint16_t u16[16];
|
||||
__m256i v256;
|
||||
};
|
||||
typedef union _m256_v16 m256_v16;
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user