mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.5.6

avxdefs.h (+249)
@@ -34,6 +34,140 @@ uint16_t v16[ 8];
uint8_t v8 [16];
} area128;

#if defined (__AVX2__)

// Replacements for vectorized data
// n = number of __m256i (32 bytes)
inline void memset_zero_m256i( __m256i *dst, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = _mm256_setzero_si256();
}

inline void memset_m256i( __m256i *dst, const __m256i a, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = a;
}

// optimized copying, first fit is usually best. If none of these works there
// are __m128i versions or plain memcpy.

// Fixed size

// multi buffered copy for 64 bytes, the size of a cache line.
// minimum alignment is 32 bytes, optimum for cache is 64.
// src & dst are __m256i*
inline void mcpy64_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;          // written through, so must not be const
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   _mm256_store_si256( dest, a );
   _mm256_store_si256( dest + 1, b );
}

inline void mcpy96_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;          // written through, so must not be const
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   _mm256_store_si256( dest, a );
   __m256i c = _mm256_load_si256( srce + 2 );
   _mm256_store_si256( dest + 1, b );
   _mm256_store_si256( dest + 2, c );
}

// 4-register buffered copy of 8 __m256i (256 bytes, two cache lines)
inline void mcpy128_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;          // written through, so must not be const
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   __m256i c = _mm256_load_si256( srce + 2 );
   _mm256_store_si256( dest , a );
   __m256i d = _mm256_load_si256( srce + 3 );
   _mm256_store_si256( dest + 1, b );
   a = _mm256_load_si256( srce + 4 );
   _mm256_store_si256( dest + 2, c );
   b = _mm256_load_si256( srce + 5 );
   _mm256_store_si256( dest + 3, d );
   c = _mm256_load_si256( srce + 6 );
   _mm256_store_si256( dest + 4, a );
   d = _mm256_load_si256( srce + 7 );
   _mm256_store_si256( dest + 5, b );
   _mm256_store_si256( dest + 6, c );
   _mm256_store_si256( dest + 7, d );
}

// Variable size

// copy multiples of 128 bytes using quad buffering, interleaving the
// first read of the next cache line with the last write of the current one.
// n = number of __m256i (32 bytes each) to copy
// minimum alignment: 32 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 128 bytes (n = 4)
// recommended size.: 256+ bytes
// minimum increment: 128 bytes
// Only the first load or store in a cache line triggers a memory access;
// the subsequent accesses are trivial because they hit data cached by the
// first. The second cache line is primed before the first is dumped,
// giving reads priority on the bus so the CPU is never left waiting for
// data held up behind a write-back.

inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n )
{
   __m256i* dest = dst;          // written through, so must not be const
   const __m256i* srce = src;

   // preload 1 cache line to absorb startup latency
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   // start loading second line, queue while waiting
   __m256i c = _mm256_load_si256( srce + 2 );
   // start writing the first line as soon as data is available;
   // the second line read will have priority on the bus
   _mm256_store_si256( dest, a );
   __m256i d;

   int i;
   const int loops = n/4 - 1;

   for ( i = 0; i < loops; i++ )
   {
      const int i4 = i*4;
      const __m256i* si4 = srce + i4;
      __m256i* di4 = dest + i4;

      d = _mm256_load_si256( si4 + 3 );
      _mm256_store_si256( di4 + 1, b );
      // start loading next line
      a = _mm256_load_si256( si4 + 4 );
      _mm256_store_si256( di4 + 2, c );
      b = _mm256_load_si256( si4 + 5 );
      _mm256_store_si256( di4 + 3, d );
      c = _mm256_load_si256( si4 + 6 );
      // start writing next line
      _mm256_store_si256( di4 + 4, a );
   }
   // finish the last line; the final vector is at srce + n - 1
   d = _mm256_load_si256( srce + n - 1 );
   _mm256_store_si256( dest + n - 3, b );
   _mm256_store_si256( dest + n - 2, c );
   _mm256_store_si256( dest + n - 1, d );
}
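
A minimal usage sketch (not part of the patch; the helper name and buffer sizes are illustrative), copying the recommended minimum of 256 bytes:

#include <immintrin.h>

void example_copy( void )   // hypothetical helper, not in avxdefs.h
{
   // 8 x __m256i = 256 bytes, 64-byte aligned for best cache behaviour
   __m256i *src = (__m256i*) _mm_malloc( 256, 64 );
   __m256i *dst = (__m256i*) _mm_malloc( 256, 64 );

   memset_m256i( src, _mm256_set1_epi64x( 0x0123456789abcdefULL ), 8 );
   mcpy_m256i_x4( dst, src, 8 );   // n = 8 vectors, a multiple of 4

   _mm_free( src );
   _mm_free( dst );
}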

// basic __m256i memcpy

inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = src[i];
}

// For cheating with pointer types

// p = any aligned pointer
@@ -63,8 +197,6 @@ uint8_t v8 [16];
// dst = a( b, a[127:0] ) ; mask == 1
//__m256i _mm256_inserti128_si256( __m256i a, __m128i b, const int mask );

#if defined __AVX2__

// Rotate bits in 4 uint64 (3 instructions)
// __m256i mm256_rotr_64( __m256i, int )
#define mm256_rotr_64( w, c ) \
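The hunk cuts the macro off at its line continuation. A conventional 3-instruction rotate matching the comment above is sketched here as a comment; it is an assumption, not necessarily the author's exact continuation:

// hypothetical body of mm256_rotr_64( w, c ):
//    _mm256_or_si256( _mm256_srli_epi64( w, c ), \
//                     _mm256_slli_epi64( w, 64 - (c) ) )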
@@ -141,6 +273,119 @@ uint8_t v8 [16];

#endif // AVX2

// Replacements for vectorized data

inline void memset_zero_m128i( __m128i *dst, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = _mm_setzero_si128();
}

inline void memset_m128i( __m128i *dst, const __m128i a, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = a;
}

// __m128i versions of optimized copying

inline void mcpy32_m128i( __m128i* dst, const __m128i* src )
{
   __m128i* dest = dst;          // written through, so must not be const
   const __m128i* srce = src;
   // 2 loads, half a cache line
   __m128i a = _mm_load_si128( srce );
   __m128i b = _mm_load_si128( srce + 1 );
   _mm_store_si128( dest, a );
   _mm_store_si128( dest + 1, b );
}

inline void mcpy64_m128i( __m128i* dst, const __m128i* src )
{
   __m128i* dest = dst;          // written through, so must not be const
   const __m128i* srce = src;
   // 4 loads fill a cache line
   __m128i a = _mm_load_si128( srce );
   __m128i b = _mm_load_si128( srce + 1 );
   __m128i c = _mm_load_si128( srce + 2 );
   __m128i d = _mm_load_si128( srce + 3 );
   // need to store a before overwriting it
   _mm_store_si128( dest, a );
   a = _mm_load_si128( srce + 4 );
   _mm_store_si128( dest + 1, b );
   b = _mm_load_si128( srce + 5 );
   _mm_store_si128( dest + 2, c );
   c = _mm_load_si128( srce + 6 );
   _mm_store_si128( dest + 3, d );
   d = _mm_load_si128( srce + 7 );
   _mm_store_si128( dest + 4, a );
   _mm_store_si128( dest + 5, b );
   _mm_store_si128( dest + 6, c );
   _mm_store_si128( dest + 7, d );
}

// Variable length

// copy multiples of 64 bytes using quad buffering.
// n = number of __m128i (16 bytes each) to copy
// minimum alignment: 16 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 64 bytes (n = 4)
// recommended size.: 128+ bytes
// minimum increment: 64 bytes
inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n )
{
   // preload 1 cache line to absorb startup latency
   __m128i a = _mm_load_si128( src );
   __m128i b = _mm_load_si128( src + 1 );
   __m128i c = _mm_load_si128( src + 2 );
   __m128i d = _mm_load_si128( src + 3 );

   int i;
   const int loops = n/4 - 1;
   __m128i* dst_n = dst + n;     // written through, so must not be const

   for ( i = 0; i < loops; i++ )
   {
      const int i4 = i*4;
      const __m128i* si4 = src + i4;
      __m128i* di4 = dst + i4;

      // need to store a before overwriting it; loads come from src
      _mm_store_si128( di4, a );
      a = _mm_load_si128( si4 + 4 );
      _mm_store_si128( di4 + 1, b );
      b = _mm_load_si128( si4 + 5 );
      _mm_store_si128( di4 + 2, c );
      c = _mm_load_si128( si4 + 6 );
      _mm_store_si128( di4 + 3, d );
      d = _mm_load_si128( si4 + 7 );
   }
   _mm_store_si128( dst_n - 4, a );
   _mm_store_si128( dst_n - 3, b );
   _mm_store_si128( dst_n - 2, c );
   _mm_store_si128( dst_n - 1, d );
}
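
As with the AVX2 copy, a minimal call sketch (hypothetical buffer and helper names, not part of the patch) honouring the size and alignment rules above:

#include <stdint.h>

static uint64_t in_buf [16] __attribute__((aligned(64)));   // 128 bytes
static uint64_t out_buf[16] __attribute__((aligned(64)));

void example_m128i_copy( void )   // hypothetical helper
{
   // n = 8 __m128i = 128 bytes, a multiple of the 64 byte increment
   memcpy_m128i_x4( (__m128i*)out_buf, (const __m128i*)in_buf, 8 );
}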

// basic __m128i copy
inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = src[i];
}

// For cheating with pointer types

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
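
A short sketch (hypothetical names, not from the patch) of how these macros give scalar and vector views of the same buffer without copying:

#include <stdint.h>

static uint32_t hash[8] __attribute__((aligned(16)));   // 32 bytes

void zero_then_fill( void )   // hypothetical helper
{
   memset_zero_m128i( castp_m128i( hash ), 2 );    // pointer view
   casti_m128i( hash, 1 ) = _mm_set1_epi32( -1 );  // indexed element view
}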

// rotate bits in 2 uint64
// __m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
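The diff ends mid-macro. The standard continuation for a 64-bit rotate right is sketched below as a comment; it is an assumption, not necessarily the author's exact line:

// hypothetical continuation of mm_rotr_64:
//                 _mm_slli_epi64(w, 64-(c)))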