//////////////////////////////////////
//
// Type abstraction overlays designed for use in highly optimized
// straight-line code operating on array structures. Direct struct
// member access is used instead of indexing to access array elements:
// array.u32_3 instead of array[3].
//
// Vector types are used to represent arrays. 64 and 128 bit vectors have
// corresponding 64 and 128 bit integer types.
//
// Data accesses are not tied to memory as arrays are. These structures
// can operate comfortably as register variables.
//
// Although the abstraction makes for transparent usage there is overhead.
// Extra move instructions are required when an operation requires a
// different register type. Additionally, 128 bit operations, uint128_t
// and AES, can't be done in parallel with a 256 bit or larger vector.
// They require additional move instructions on top of the lack of
// improvement from parallelism.
//
// Move instruction overhead is required when moving among gpr, mmx
// and xmm registers. The number of extra moves is usually the number
// of elements in the vector. If both are the same size only one move
// is required. The number is doubled if the data is moved back.
//
// xmm and ymm registers are special: they are aliased. xmm registers
// overlay the lower 128 bits of the ymm registers. Accessing the data
// in the lower half of a ymm register through an xmm argument is free.
// The upper 128 bits need to be extracted and inserted as with other
// differently sized data types.
//
// Integer types can be converted to differently sized integers without
// penalty.
//
// Conversions with penalty should be avoided as much as possible by
// grouping operations requiring the same register set.
//
// There are two algorithms for extracting and inserting data.
//
// There is the straightforward iterative method where each element is
// extracted or inserted in turn. The compiler evidently takes a different
// approach, based on the assembly code generated by a set intrinsic.
// To extract 64 bit or smaller elements from a 256 bit vector it
// first extracts the upper 128 bits into a second xmm register. This
// eliminates a dependency between the upper and lower elements, giving
// the CPU more opportunity to perform multiple operations per clock.
// It adds one instruction to the process. With AVX512 another stage is
// added by first splitting the 512 bit vector into two 256 bit vectors.
//
// xmm/ymm aliasing makes accessing the low half trivial and without cost.
// Accessing the upper half requires a move from the upper half of
// the source register to the lower half of the destination.
// It's a bigger issue with GPRs as there is no aliasing.
//
// Theoretically, memory resident data could bypass the move and load
// the data directly into the desired register type. However, this
// ignores the overhead needed to ensure coherency between register and
// memory, which is significantly greater.
//
// The overlay avoids pointer dereferences and favours register moves over
// memory loads, notwithstanding compiler optimization.
//
// The syntax is ugly but can be abstracted with macros.

// Universal 64 bit overlay
// Avoids arrays and pointers, suitable as a register variable.
// Conversions are transparent but not free, cost is one MOV instruction.
// Facilitates manipulating 32 bit data in 64 bit pairs.
// Allows full use of 64 bit registers for 32 bit data, effectively doubling
// the size of the register set.
// Potentially up to 50% reduction in instructions depending on the rate of
// conversion.
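
// As an illustration of the idea only (not part of the interface defined
// below), a minimal sketch of a 64 bit overlay holding a pair of 32 bit
// values in one register-resident variable. The names pair64 and add32x2
// are hypothetical and exist only for this example.
#include <stdint.h>

typedef union
{
   struct { uint32_t _0; uint32_t _1; } u32_;
   uint64_t u64;
} pair64;

// Add two pairs of 32 bit values using direct member access instead of
// array indexing; the whole pair can live in a single 64 bit register.
static inline pair64 add32x2( pair64 a, pair64 b )
{
   pair64 r;
   r.u32_._0 = a.u32_._0 + b.u32_._0;
   r.u32_._1 = a.u32_._1 + b.u32_._1;
   return r;
}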
///////////////////////////////////////////////////////
//
// 128 bit integer
//
// Native type __int128 is supported starting with GCC-4.8.
//
// __int128 uses two 64 bit GPRs to hold the data. The main benefit is
// for 128 bit arithmetic. Vectors are preferred when 128 bit arithmetic
// is not required. int128 also works better with other integer sizes,
// while vectors benefit from wider registers.
//
// For safety use typecasting on all numeric arguments.
//
// Use typecasting for conversion to/from 128 bit vector:
// __m128i v128 = (__m128i)my_int128;
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );

// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
  #define GCC_INT128 1
#endif

#if !defined(GCC_INT128)
  #warning "__int128 not supported, requires GCC-4.8 or newer."
#endif

#if defined(GCC_INT128)

// Familiar looking type names
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;

#endif

/////////////////////////////////////
//
// MMX 64 bit vector
//

// Emulates uint32_t[2]
struct _regarray_u32x2
{
   uint32_t _0;  uint32_t _1;
};
typedef struct _regarray_u32x2 regarray_u32x2;

// Emulates uint16_t[4]
struct _regarray_u16x4
{
   uint16_t _0;  uint16_t _1;  uint16_t _2;  uint16_t _3;
};
typedef struct _regarray_u16x4 regarray_u16x4;

// Emulates uint8_t[8]
struct _regarray_u8x8
{
   uint8_t _0;  uint8_t _1;  uint8_t _2;  uint8_t _3;
   uint8_t _4;  uint8_t _5;  uint8_t _6;  uint8_t _7;
};
typedef struct _regarray_u8x8 regarray_u8x8;

// Universal 64 bit overlay
union _regarray_64
{
   regarray_u32x2 u32_;   // uint32_t[2]
   regarray_u16x4 u16_;   // uint16_t[4]
   regarray_u8x8  u8_;    // uint8_t[8]
   uint64_t       u64;
   __m64          v64;
};
typedef union _regarray_64 regarray_64;

/////
//
// SSE2

// Universal 128 bit overlay
//
// Avoids arrays and pointers, suitable as a register variable.
// Designed for speed in straight-line code with no loops.
//
// Conversions are transparent but not free, cost is one MOV instruction
// in each direction, except for the lower half of a ymm to/from xmm,
// which is free.
//
// Facilitates two dimensional vectoring.
//
// 128 bit integer and AES can't be done in parallel. AES suffers extraction
// and insertion of the upper 128 bits. uint128_t suffers 4 times the cost
// with 2 64 bit extractions and 2 insertions for each 128 bit lane with
// single stage ymm <--> gpr for a total of 8 moves.
//
// Two stage conversion is possible, which helps CPU instruction scheduling
// by removing a register dependency between the upper and lower 128 bits at
// the cost of two extra instructions (128 bit extract and insert). The
// compiler seems to prefer the 2 staged approach when using the set
// intrinsic.
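
// A minimal sketch of the two stage conversion described above, assuming
// AVX2 and GCC_INT128; extract_u128_hi is a hypothetical name used only
// for this example.
#include <immintrin.h>

#if defined(GCC_INT128) && defined(__AVX2__)

// Two stage: peel the upper 128 bits off into an xmm register first,
// breaking the dependency between the upper and lower lanes, then move
// the two 64 bit halves to GPRs.
static inline uint128_t extract_u128_hi( __m256i v )
{
   __m128i  hi   = _mm256_extracti128_si256( v, 1 );      // stage 1: ymm -> xmm
   uint64_t lo64 = (uint64_t)_mm_cvtsi128_si64( hi );     // stage 2: xmm -> gpr
   uint64_t hi64 = (uint64_t)_mm_extract_epi64( hi, 1 );  // stage 2: xmm -> gpr
   return ( (uint128_t)hi64 << 64 ) | (uint128_t)lo64;
}

#endif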
// Use macros to simplify array access emulation.
// emulated array type:  uint64_t a[2];
// array indexing:       a[0], a[1]
// overlay emulation:    a.u64_0, a.u64_1
// without macro:        a.u64_._0, a.u64_._1

struct _regarray_u64x2
{
   uint64_t _0;  uint64_t _1;
};
typedef struct _regarray_u64x2 regarray_u64x2;

struct _regarray_v64x2
{
   __m64 _0;  __m64 _1;
};
typedef struct _regarray_v64x2 regarray_v64x2;

struct _regarray_u32x4
{
   uint32_t _0;  uint32_t _1;  uint32_t _2;  uint32_t _3;
};
typedef struct _regarray_u32x4 regarray_u32x4;

struct _regarray_u16x8
{
   uint16_t _0;  uint16_t _1;  uint16_t _2;  uint16_t _3;
   uint16_t _4;  uint16_t _5;  uint16_t _6;  uint16_t _7;
};
typedef struct _regarray_u16x8 regarray_u16x8;

struct _regarray_u8x16
{
   uint8_t _0;  uint8_t _1;  uint8_t _2;  uint8_t _3;
   uint8_t _4;  uint8_t _5;  uint8_t _6;  uint8_t _7;
   uint8_t _8;  uint8_t _9;  uint8_t _a;  uint8_t _b;
   uint8_t _c;  uint8_t _d;  uint8_t _e;  uint8_t _f;
};
typedef struct _regarray_u8x16 regarray_u8x16;

union _register_array_m128v
{
#if defined(GCC_INT128)
   uint128_t u128;
#endif
   __m128i         v128;
   regarray_u64x2  u64_;   // uint64_t[2]
   regarray_v64x2  v64_;   // __m64[2]
   regarray_u32x4  u32_;   // uint32_t[4]
   regarray_u16x8  u16_;   // uint16_t[8]
   regarray_u8x16  u8_;    // uint8_t[16]
};
typedef union _register_array_m128v register_array_m128v;

///////////////////
//
// AVX2
//

struct _regarray_v128x2
{
   __m128i _0;  __m128i _1;
};
typedef struct _regarray_v128x2 regarray_v128x2;

#if defined(GCC_INT128)
struct _regarray_u128x2
{
   uint128_t _0;  uint128_t _1;
};
typedef struct _regarray_u128x2 regarray_u128x2;
#endif

struct _regarray_u64x4
{
   uint64_t _0;  uint64_t _1;  uint64_t _2;  uint64_t _3;
};
typedef struct _regarray_u64x4 regarray_u64x4;

struct _regarray_v64x4
{
   __m64 _0;  __m64 _1;  __m64 _2;  __m64 _3;
};
typedef struct _regarray_v64x4 regarray_v64x4;

struct _regarray_u32x8
{
   uint32_t _0;  uint32_t _1;  uint32_t _2;  uint32_t _3;
   uint32_t _4;  uint32_t _5;  uint32_t _6;  uint32_t _7;
};
typedef struct _regarray_u32x8 regarray_u32x8;

struct _regarray_u16x16
{
   uint16_t _0;  uint16_t _1;  uint16_t _2;  uint16_t _3;
   uint16_t _4;  uint16_t _5;  uint16_t _6;  uint16_t _7;
   uint16_t _8;  uint16_t _9;  uint16_t _a;  uint16_t _b;
   uint16_t _c;  uint16_t _d;  uint16_t _e;  uint16_t _f;
};
typedef struct _regarray_u16x16 regarray_u16x16;

struct _regarray_u8x32
{
   uint8_t _00;  uint8_t _01;  uint8_t _02;  uint8_t _03;
   uint8_t _04;  uint8_t _05;  uint8_t _06;  uint8_t _07;
   uint8_t _08;  uint8_t _09;  uint8_t _0a;  uint8_t _0b;
   uint8_t _0c;  uint8_t _0d;  uint8_t _0e;  uint8_t _0f;
   uint8_t _10;  uint8_t _11;  uint8_t _12;  uint8_t _13;
   uint8_t _14;  uint8_t _15;  uint8_t _16;  uint8_t _17;
   uint8_t _18;  uint8_t _19;  uint8_t _1a;  uint8_t _1b;
   uint8_t _1c;  uint8_t _1d;  uint8_t _1e;  uint8_t _1f;
};
typedef struct _regarray_u8x32 regarray_u8x32;

union _regarray_v256
{
   __m256i v256;
#if defined(GCC_INT128)
   regarray_u128x2 u128_;   // uint128_t[2]
#endif
   regarray_v128x2 v128_;   // __m128i[2]
   regarray_v64x4  v64_;    // __m64[4]
   regarray_u64x4  u64_;    // uint64_t[4]
   regarray_u32x8  u32_;    // uint32_t[8]
   regarray_u16x16 u16_;    // uint16_t[16]
   regarray_u8x32  u8_;     // uint8_t[32]
};
typedef union _regarray_v256 regarray_v256;
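
// A minimal sketch of the two dimensional vectoring mentioned above,
// assuming AVX2 and AES-NI; aes_round_2x128 is a hypothetical name used
// only for this example. AES has no 256 bit form, so each 128 bit lane
// of the overlay is processed separately through the v128_ view.
#if defined(__AVX2__) && defined(__AES__)

static inline regarray_v256 aes_round_2x128( regarray_v256 a,
                                             regarray_v256 k )
{
   regarray_v256 r;
   r.v128_._0 = _mm_aesenc_si128( a.v128_._0, k.v128_._0 );
   r.v128_._1 = _mm_aesenc_si128( a.v128_._1, k.v128_._1 );
   return r;
}

#endif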
////////////
//
// Abstraction macros to allow easy readability.
// Users may define their own list to suit their preferences,
// such as upper case hex, leading zeros, multidimensional,
// alphabetic, day of week, etc.

#define v128_0    v128_._0
#define v128_1    v128_._1

#define u128_0    u128_._0
#define u128_1    u128_._1

#define v64_0     v64_._0
#define v64_1     v64_._1
#define v64_2     v64_._2
#define v64_3     v64_._3

#define u64_0     u64_._0
#define u64_1     u64_._1
#define u64_2     u64_._2
#define u64_3     u64_._3

#define u32_0     u32_._0
#define u32_1     u32_._1
#define u32_2     u32_._2
#define u32_3     u32_._3
#define u32_4     u32_._4
#define u32_5     u32_._5
#define u32_6     u32_._6
#define u32_7     u32_._7

#define u16_0     u16_._0
#define u16_1     u16_._1
#define u16_2     u16_._2
#define u16_3     u16_._3
#define u16_4     u16_._4
#define u16_5     u16_._5
#define u16_6     u16_._6
#define u16_7     u16_._7
#define u16_8     u16_._8
#define u16_9     u16_._9
#define u16_a     u16_._a
#define u16_b     u16_._b
#define u16_c     u16_._c
#define u16_d     u16_._d
#define u16_e     u16_._e
#define u16_f     u16_._f

#define u8_00     u8_._00
#define u8_01     u8_._01
#define u8_02     u8_._02
#define u8_03     u8_._03
#define u8_04     u8_._04
#define u8_05     u8_._05
#define u8_06     u8_._06
#define u8_07     u8_._07
#define u8_08     u8_._08
#define u8_09     u8_._09
#define u8_0a     u8_._0a
#define u8_0b     u8_._0b
#define u8_0c     u8_._0c
#define u8_0d     u8_._0d
#define u8_0e     u8_._0e
#define u8_0f     u8_._0f
#define u8_10     u8_._10
#define u8_11     u8_._11
#define u8_12     u8_._12
#define u8_13     u8_._13
#define u8_14     u8_._14
#define u8_15     u8_._15
#define u8_16     u8_._16
#define u8_17     u8_._17
#define u8_18     u8_._18
#define u8_19     u8_._19
#define u8_1a     u8_._1a
#define u8_1b     u8_._1b
#define u8_1c     u8_._1c
#define u8_1d     u8_._1d
#define u8_1e     u8_._1e
#define u8_1f     u8_._1f
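
// A short usage sketch of the macros above, assuming AVX2; sum_u64x4 is
// a hypothetical name used only for this example. The macros expand
// a.u64_0 to a.u64_._0 and so on, giving array-like readability with
// direct member access.
#if defined(__AVX2__)

// Sum the four 64 bit lanes of a 256 bit overlay using the abstracted
// member names instead of array indexing.
static inline uint64_t sum_u64x4( regarray_v256 a )
{
   return a.u64_0 + a.u64_1 + a.u64_2 + a.u64_3;
}

#endif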