#if !defined(SIMD_UTILS_H__)
#define SIMD_UTILS_H__ 1

//////////////////////////////////////////////////////////////////////
//
// SIMD utilities
//
// Not to be confused with the hashing function of the same name. This
// is about Single Instruction Multiple Data programming using CPU
// features such as SSE and AVX.
//
// This header is the entry point to a suite of macros and functions
// to perform basic operations on vectors that are useful in crypto
// mining. Some of these functions have native CPU support for scalar
// data but not for vectors. The main categories are bit rotation
// and endian byte swapping.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// This suite supports some operations on regular 64 bit integers
// as well as 128 bit integers available on recent versions of Linux
// and GCC.
//
// It also supports various vector sizes on CPUs that meet the minimum
// requirements.
//
// The minimum for any real work is a 64 bit CPU with SSE2,
// i.e. the Intel Core 2.
//
// Following are the minimum requirements for each vector size. There
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX:     64 bit vectors
// SSE2:   128 bit vectors (64 bit CPUs only, such as Intel Core2).
// AVX2:   256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (still under development)
//
// Most functions are available at the stated levels but in rare cases
// a higher level feature may be required with no compatible alternative.
// Some SSE2 functions have versions optimized for higher feature levels
// such as SSSE3 or SSE4.1 that will be used automatically on capable
// CPUs.
//
// The vector size boundaries are respected to maintain compatibility.
// For example, an instruction introduced with AVX2 may improve 128 bit
// vector performance but will not be implemented. A CPU with AVX2 will
// tend to use 256 bit vectors. On a practical level AVX512 does introduce
// bit rotation instructions for 128 and 256 bit vectors in addition to
// its own 512 bit vectors. These will not be back ported to replace the
// SW implementations for the smaller vectors. This policy may be reviewed
// in the future once AVX512 is established.
//
// Strict alignment of data is required: 16 bytes for 128 bit vectors,
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
// alignment is recommended in all cases for best cache alignment.
//
// Windows has problems with function vector arguments larger than
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
// pointers for larger vectors in function arguments. Macros can be
// used for larger value arguments.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// The main differences are:
//
// - the leading underscore(s) "_" and the "i" are dropped from the
//   prefix of vector instructions.
// - "mm64" and "mm128" used for 64 and 128 bit prefix respectively
//   to avoid the ambiguity of "mm".
// - the element size does not include additional type specifiers
//   like "epi".
// - some macros contain value args that are updated.
// - specialized shift and rotate functions that move elements around
//   use the notation "1x32" to indicate the distance moved as units of
//   the element size.
// - there is a subset of some functions for scalar data. They may have
//   no prefix nor vec-size, just one size, the size of the data.
// - Some integer functions are also defined which use a similar notation.
//
// Function names follow this pattern:
//
//    prefix_op[esize]_[vsize]
//
// Prefix: usually the size of the largest vectors used. Following
//         are some examples:
//
//    u64:   unsigned 64 bit integer function
//    i128:  signed 128 bit integer function (rarely used)
//    m128:  128 bit vector identifier
//    mm128: 128 bit vector function
//
// op: describes the operation of the function or names the data
//     identifier.
//
// esize: optional, element size of operation
//
// vsize: optional, lane size used when a function operates on elements
//        of vectors within lanes of a vector.
//
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
//     right by 64 bits.
//
// Some random thoughts about macros and inline functions, the pros and
// cons, when to use them, etc:
//
// Macros are very convenient and efficient for statement functions.
// Macro args are substituted textually rather than passed by value, so
// modifications to them are seen by the caller.
// Macros should not generally call regular functions unless it is for a
// special purpose such as overloading a function name.
// Statement function macros that return a value should not end in ";"
// Statement function macros that return a value and don't modify input args
// may be used in function arguments and expressions.
// Macro args used in expressions should be protected ex: (x)+1
// Macros force inlining, function inlining can be overridden by the compiler.
// Inline functions are preferred when multiple statements or local variables
// are needed.
// The compiler can't do any syntax checking or type checking of args making
// macros difficult to debug.
// Although it is technically possible to access the caller's data without
// it being passed as arguments it is good practice to always define
// arguments even if they have the same name.
//
// General guidelines for inline functions:
//
// Inline functions should not have loops, it defeats the purpose of inlining.
// Inline functions should be short, the benefit is lost and the memory cost
// increases if the function is referenced often.
// Inline functions may call other functions, inlined or not. It is convenient
// for wrapper functions whether or not the wrapped function is itself inlined.
// Care should be taken when unrolling loops that contain calls to inlined
// functions that may be large.
// Large code blocks used only once may use function inlining to
// improve high level code readability without the penalty of function
// overhead.
//
// A major restructuring is taking place shifting the focus from pointers
// to registers. Previously pointer casting used memory to provide transparency
// leaving it up to the compiler to manage everything and it does a very good
// job. The focus has shifted to register arguments for more control
// over the actual instructions assuming the data is in a register and the
// compiler just needs to manage the registers.
//
// Rather than use pointers to provide type transparency
// specific instructions are used to access specific data as specific types.
// Previously pointers were cast and the compiler was left to find a way
// to get the data from wherever it happened to be to the correct registers.
//
// The utilities defined here make use of features like register aliasing
// to optimize operations. Many operations have specialized versions as
// well as more generic versions. It is preferable to use a specialized
// version whenever possible as they can take advantage of certain
// optimizations not available to the generic version. Specifically the generic
// version usually has a second argument used in some extra calculations.
//
///////////////////////////////////////////////////////

// NOTE(review): the header names of the five #include directives below are
// missing in this copy of the file; restore them before building.
#include
#include
#include
#include
#include

// Various types and overlays
#include "simd-utils/simd-types.h"

// 64 and 128 bit integers.
#include "simd-utils/simd-int.h"

// The feature tests below are nested: the headers for each wider vector
// size are only pulled in when every narrower level tested above it is
// also available at compile time.

#if defined(__MMX__)

// 64 bit vectors
#include "simd-utils/simd-64.h"
#include "simd-utils/intrlv-mmx.h"

#if defined(__SSE2__)

// 128 bit vectors
#include "simd-utils/simd-128.h"
#include "simd-utils/intrlv-sse2.h"

#if defined(__AVX__)

// 256 bit vector basics
#include "simd-utils/simd-256.h"
#include "simd-utils/intrlv-avx.h"

#if defined(__AVX2__)

// 256 bit everything else
//#include "simd-utils/simd-avx2.h"
#include "simd-utils/intrlv-avx2.h"

// Skylake-X has all these
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// 512 bit vectors
#include "simd-utils/simd-512.h"
#include "simd-utils/intrlv-avx512.h"

// Each #endif closes the innermost open test, so the labels run from the
// widest vector size back out to the narrowest.
#endif  // AVX512
#endif  // AVX2
#endif  // AVX
#endif  // SSE2
#endif  // MMX

// Picks implementation based on available CPU features.
#include "simd-utils/intrlv-selector.h"

#endif // SIMD_UTILS_H__