cpuminer-opt v3.8.7
mirror of https://github.com/JayDDee/cpuminer-opt.git
@@ -48,8 +48,9 @@ Supported Algorithms
                           allium        Garlicoin
                           anime         Animecoin
                           argon2        Argon2 coin (AR2)
-                          argon2d-crds  Credits (CRDS)
-                          argon2d-dyn   Dynamic (DYN)
+                          argon2d250    argon2d-crds, Credits (CRDS)
+                          argon2d500    argon2d-dyn, Dynamic (DYN)
+                          argon2d4096   argon2d-uis, Unitus (UIS)
                           axiom         Shabal-256 MemoHash
                           bastion
                           blake         Blake-256 (SFR)
README.txt (13 lines changed)
@@ -21,14 +21,15 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.
 Changes in v3.8.4 may have improved compatibility with some of these CPUs.

-Exe name                 Compile flags              Arch name
-
-cpuminer-sse2.exe        "-msse2"                   Core2, Nehalem
-cpuminer-aes-sse42.exe   "-maes -msse4.2"           Westmere
-cpuminer-aes-avx.exe     "-march=corei7-avx"        Sandybridge, Ivybridge
-cpuminer-avx2.exe        "-march=core-avx2"         Haswell...
-cpuminer-avx2-sha.exe    "-march=core-avx2 -msha"   Ryzen
+Exe name                 Compile flags              Arch name
+
+cpuminer-sse2.exe        "-msse2"                   Core2, Nehalem
+cpuminer-aes-sse42.exe   "-maes -msse4.2"           Westmere, Sandy-Ivybridge
+cpuminer-avx2.exe        "-march=core-avx2"         Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx2-sha.exe    "-march=core-avx2 -msha"   Ryzen

 If you like this software feel free to donate:
@@ -81,7 +81,7 @@ cd cpuminer-opt-x.y.z
 Run ./build.sh to build on Linux or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Additional optional compile flags, add the following to CFLAGS to activate:
@@ -160,6 +160,15 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
 Change Log
 ----------

+v3.8.7
+
+Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
+argon2d-crds and argon2d-dyn were renamed to argon2d250 and argon2d500
+respectively. The old names are recognized as aliases.
+AVX512 is now supported for the argon2d algos, Linux only.
+AVX is no longer a reported feature and an AVX Windows binary is no longer
+provided. Use the AES-SSE42 build instead.
+
 v3.8.6.1

 Faster argon2d* AVX2.
@@ -160,8 +160,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_ALLIUM:       register_allium_algo       ( gate ); break;
     case ALGO_ANIME:        register_anime_algo        ( gate ); break;
     case ALGO_ARGON2:       register_argon2_algo       ( gate ); break;
-    case ALGO_ARGON2DCRDS:  register_argon2d_crds_algo ( gate ); break;
-    case ALGO_ARGON2DDYN:   register_argon2d_dyn_algo  ( gate ); break;
+    case ALGO_ARGON2D250:   register_argon2d_crds_algo ( gate ); break;
+    case ALGO_ARGON2D500:   register_argon2d_dyn_algo  ( gate ); break;
+    case ALGO_ARGON2D4096:  register_argon2d4096_algo  ( gate ); break;
     case ALGO_AXIOM:        register_axiom_algo        ( gate ); break;
     case ALGO_BASTION:      register_bastion_algo      ( gate ); break;
     case ALGO_BLAKE:        register_blake_algo        ( gate ); break;
@@ -288,6 +289,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                 proper
+  { "argon2d-crds",       "argon2d250"   },
+  { "argon2d-dyn",        "argon2d500"   },
+  { "argon2d-uis",        "argon2d4096"  },
   { "bitcore",            "timetravel10" },
   { "bitzeny",            "yescryptr8"   },
   { "blake256r8",         "blakecoin"    },
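A sketch of how a scan over this table resolves an alias to its proper name. The project's actual lookup function is not shown in this diff; the helper below is hypothetical and assumes the table ends with a NULL sentinel entry:

#include <strings.h>

// Hypothetical resolver: returns the proper algo name for an alias,
// or the input unchanged if it isn't an alias.
const char* resolve_algo_alias( const char* name )
{
   for ( int i = 0; algo_alias_map[i][0] != NULL; i++ )
      if ( !strcasecmp( algo_alias_map[i][0], name ) )
         return algo_alias_map[i][1];
   return name;
}

Note the commas inside each entry matter: without them the two adjacent string literals would silently concatenate into a single string, leaving the "proper" column empty.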
@@ -2,6 +2,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "miner.h"
 #include "avxdefs.h"
+#include "interleave.h"

 /////////////////////////////
 ////
@@ -91,6 +93,7 @@ typedef uint32_t set_t;

 #define AVX_OPT       8
 #define AVX2_OPT   0x10
 #define SHA_OPT    0x20
+#define AVX512_OPT 0x40

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
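These flags are one-hot bits in a set_t bitmask, so capability sets compose with OR and membership tests reduce to masking. A minimal sketch; set_incl mirrors the test used by check_cpu_capability() in cpu-miner.c, though its exact definition may differ in the project:

static inline bool set_incl( set_t a, set_t b ) { return ( a & b ) == a; }

set_t features  = set_union( SSE2_OPT, set_union( AVX2_OPT, AVX512_OPT ) );
bool  has_512   = set_incl( AVX512_OPT, features );   // true
bool  has_sha   = set_incl( SHA_OPT,    features );   // false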
@@ -70,7 +70,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
   gate->scanhash   = (void*)&scanhash_argon2d_crds;
   gate->hash       = (void*)&argon2d_crds_hash;
   gate->set_target = (void*)&scrypt_set_target;
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
 }

 // Dynamic
@@ -138,6 +139,56 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
   gate->scanhash   = (void*)&scanhash_argon2d_dyn;
   gate->hash       = (void*)&argon2d_dyn_hash;
   gate->set_target = (void*)&scrypt_set_target;
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
 }

+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t _ALIGN(64) vhash[8];
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+
+   uint32_t t_cost = 1;       // 1 iteration
+   uint32_t m_cost = 4096;    // use 4MB
+   uint32_t parallelism = 1;  // 1 thread, 2 lanes
+
+   for ( int i = 0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   do {
+      be32enc( &endiandata[19], n );
+      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
+                        (char*) endiandata, 80, (char*) vhash, 32 );
+      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
+      {
+         *hashes_done = n - first_nonce + 1;
+         pdata[19] = n;
+         return true;
+      }
+      n++;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+
+   return 0;
+}
+
+int64_t get_max64_0x1ff() { return 0x1ff; }
+
+bool register_argon2d4096_algo( algo_gate_t* gate )
+{
+   gate->scanhash      = (void*)&scanhash_argon2d4096;
+   gate->set_target    = (void*)&scrypt_set_target;
+   gate->get_max64     = (void*)&get_max64_0x1ff;
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+   return true;
+}
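As a usage sketch of the call above: one argon2d4096 hash of an 80-byte block header, with the header serving as both password and salt, exactly the parameters scanhash_argon2d4096 passes. The argon2d_hash_raw signature is taken from the call in the diff:

uint32_t _ALIGN(64) header[20];   // 80-byte block header, big-endian encoded
uint32_t _ALIGN(64) hash[8];      // 32-byte output

// t_cost = 1 pass, m_cost = 4096 KiB (4 MB), parallelism = 1
argon2d_hash_raw( 1, 4096, 1, (char*)header, 80,
                  (char*)header, 80, (char*)hash, 32 );

The vhash[7] < Htarg test in the scan loop is a cheap pre-filter: word 7 holds the most significant 32 bits of the hash, so most nonces are rejected before fulltest() performs the full 256-bit comparison.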
avxdefs.h (766 lines changed)
@@ -1,5 +1,5 @@
 #ifndef AVXDEFS_H__
-#define AVXDEFS_H__
+#define AVXDEFS_H__ 1

 // Some tools to help using SIMD vectors.
 //
@@ -1034,6 +1034,11 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
 //
 // Pseudo constants.

+// _mm512_setzero_si512 uses an xor instruction. If needed frequently
+// in a function it's better to define a register variable (const?)
+// initialized to zero.
+// It isn't clear to me yet how set or set1 work.
+
 #define m512_zero      _mm512_setzero_si512()
 #define m512_one_512   _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \
                                          0ULL, 0ULL, 0ULL, 1ULL )
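The comment's advice in code form, as a sketch; the function name and the placeholder operation are mine, and immintrin.h is assumed to be in scope as it is throughout avxdefs.h:

// Hoist zero into a register once instead of re-materializing m512_zero
// (a vpxor) on every use inside a hot loop.
static inline void negate_all_example( __m512i *x, int n )
{
   const __m512i zero = _mm512_setzero_si512();
   for ( int i = 0; i < n; i++ )
      x[i] = _mm512_sub_epi64( zero, x[i] );   // negate, reusing the register
}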
@@ -1058,6 +1063,21 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
 //
 // Pointer casting

 // p = any aligned pointer
 // i = scaled array index
 // o = scaled address offset

+// returns p as pointer to vector
+#define castp_m512i(p) ((__m512i*)(p))
+
+// returns *p as vector value
+#define cast_m512i(p) (*((__m512i*)(p)))
+
+// returns p[i] as vector value
+#define casti_m512i(p,i) (((__m512i*)(p))[(i)])
+
+// returns p+o as pointer to vector
+#define casto_m512i(p,o) (((__m512i*)(p))+(o))

 //
 // Memory functions
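A usage sketch for these macros; the buffer and names are illustrative, and _ALIGN is the alignment macro used elsewhere in the project. Because casti_m512i expands to an array element it is an lvalue, so it works for stores as well as loads:

uint64_t _ALIGN(64) buf[16];                        // two 512-bit lanes

__m512i a = casti_m512i( buf, 0 );                  // buf[0..7] as a vector
__m512i b = cast_m512i( casto_m512i( buf, 1 ) );    // second lane
casti_m512i( buf, 0 ) = _mm512_add_epi64( a, b );   // store through the cast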
@@ -1237,746 +1257,4 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )

 #endif   // AVX512F

-// Paired functions for interleaving and deinterleaving data for vector
-// processing.
-// Size is specified in bits regardless of vector size to avoid pointer
-// arithmetic confusion with different size vectors and be consistent with
-// the function's name.
-//
-// Each function has 2 implementations, an optimized version that uses
-// vector indexing and a slower version that uses pointers. The optimized
-// version can only be used with 64 bit elements and only supports sizes
-// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
-//
-// NOTE: Contrary to GCC documentation, accessing vector elements using array
-// indexes only works with 64 bit elements.
-// Interleaving and deinterleaving of vectors of 32 bit elements
-// must use the slower implementations that don't use vector indexing.
-//
-// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
-// Interleave source args and deinterleave destination args are not required
-// to be contiguous in memory but it's more efficient if they are.
-// Interleave source args may be the same actual arg repeated.
-// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
-// destination buffers be defined with padding up to 768 bits for overrun
-// space. Although overrun space use is non destructive it should not overlay
-// useful data and should be ignored by the caller.
-
-// SSE2 AVX
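For orientation, a sketch of the intended round trip through the functions removed below (they move to the new interleave.h in this commit, with the same contract; names here are illustrative): four independent 256-bit inputs are interleaved into one 4-lane buffer, processed by a 4-way kernel, then deinterleaved.

uint32_t _ALIGN(16) in0[8], in1[8], in2[8], in3[8];    // four 256-bit inputs
uint32_t _ALIGN(16) lanes[32];                         // 4 lanes x 256 bits
uint32_t _ALIGN(16) out0[8], out1[8], out2[8], out3[8];

mm_interleave_4x32( lanes, in0, in1, in2, in3, 256 );
// ... run a 4-way SSE2 kernel over `lanes` here ...
mm_deinterleave_4x32( out0, out1, out2, out3, lanes, 256 );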
-// interleave 4 arrays of 32 bit elements for 128 bit processing
-// bit_len must be 256, 512 or 640 bits.
-static inline void mm_interleave_4x32( void *dst, const void *src0,
-           const void *src1, const void *src2, const void *src3, int bit_len )
-{
-   uint32_t *s0 = (uint32_t*)src0;
-   uint32_t *s1 = (uint32_t*)src1;
-   uint32_t *s2 = (uint32_t*)src2;
-   uint32_t *s3 = (uint32_t*)src3;
-   __m128i  *d  = (__m128i*)dst;
-
-   d[0] = _mm_set_epi32( s3[ 0], s2[ 0], s1[ 0], s0[ 0] );
-   d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] );
-   d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] );
-   d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] );
-   d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] );
-   d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] );
-   d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] );
-   d[7] = _mm_set_epi32( s3[ 7], s2[ 7], s1[ 7], s0[ 7] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[ 8] = _mm_set_epi32( s3[ 8], s2[ 8], s1[ 8], s0[ 8] );
-   d[ 9] = _mm_set_epi32( s3[ 9], s2[ 9], s1[ 9], s0[ 9] );
-   d[10] = _mm_set_epi32( s3[10], s2[10], s1[10], s0[10] );
-   d[11] = _mm_set_epi32( s3[11], s2[11], s1[11], s0[11] );
-   d[12] = _mm_set_epi32( s3[12], s2[12], s1[12], s0[12] );
-   d[13] = _mm_set_epi32( s3[13], s2[13], s1[13], s0[13] );
-   d[14] = _mm_set_epi32( s3[14], s2[14], s1[14], s0[14] );
-   d[15] = _mm_set_epi32( s3[15], s2[15], s1[15], s0[15] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[16] = _mm_set_epi32( s3[16], s2[16], s1[16], s0[16] );
-   d[17] = _mm_set_epi32( s3[17], s2[17], s1[17], s0[17] );
-   d[18] = _mm_set_epi32( s3[18], s2[18], s1[18], s0[18] );
-   d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[20] = _mm_set_epi32( s3[20], s2[20], s1[20], s0[20] );
-   d[21] = _mm_set_epi32( s3[21], s2[21], s1[21], s0[21] );
-   d[22] = _mm_set_epi32( s3[22], s2[22], s1[22], s0[22] );
-   d[23] = _mm_set_epi32( s3[23], s2[23], s1[23], s0[23] );
-
-   d[24] = _mm_set_epi32( s3[24], s2[24], s1[24], s0[24] );
-   d[25] = _mm_set_epi32( s3[25], s2[25], s1[25], s0[25] );
-   d[26] = _mm_set_epi32( s3[26], s2[26], s1[26], s0[26] );
-   d[27] = _mm_set_epi32( s3[27], s2[27], s1[27], s0[27] );
-   d[28] = _mm_set_epi32( s3[28], s2[28], s1[28], s0[28] );
-   d[29] = _mm_set_epi32( s3[29], s2[29], s1[29], s0[29] );
-   d[30] = _mm_set_epi32( s3[30], s2[30], s1[30], s0[30] );
-   d[31] = _mm_set_epi32( s3[31], s2[31], s1[31], s0[31] );
-   // bit_len == 1024
-}
-
-// bit_len must be multiple of 32
-static inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
-                                        void *src2, void *src3, int bit_len )
-{
-   uint32_t *d  = (uint32_t*)dst;
-   uint32_t *s0 = (uint32_t*)src0;
-   uint32_t *s1 = (uint32_t*)src1;
-   uint32_t *s2 = (uint32_t*)src2;
-   uint32_t *s3 = (uint32_t*)src3;
-
-   for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
-   {
-      *d     = *(s0+i);
-      *(d+1) = *(s1+i);
-      *(d+2) = *(s2+i);
-      *(d+3) = *(s3+i);
-   }
-}
-static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
-                                   void *dst3, const void *src, int bit_len )
-{
-   uint32_t *s  = (uint32_t*)src;
-   __m128i  *d0 = (__m128i*)dst0;
-   __m128i  *d1 = (__m128i*)dst1;
-   __m128i  *d2 = (__m128i*)dst2;
-   __m128i  *d3 = (__m128i*)dst3;
-
-   d0[0] = _mm_set_epi32( s[12], s[ 8], s[ 4], s[ 0] );
-   d1[0] = _mm_set_epi32( s[13], s[ 9], s[ 5], s[ 1] );
-   d2[0] = _mm_set_epi32( s[14], s[10], s[ 6], s[ 2] );
-   d3[0] = _mm_set_epi32( s[15], s[11], s[ 7], s[ 3] );
-
-   d0[1] = _mm_set_epi32( s[28], s[24], s[20], s[16] );
-   d1[1] = _mm_set_epi32( s[29], s[25], s[21], s[17] );
-   d2[1] = _mm_set_epi32( s[30], s[26], s[22], s[18] );
-   d3[1] = _mm_set_epi32( s[31], s[27], s[23], s[19] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[2] = _mm_set_epi32( s[44], s[40], s[36], s[32] );
-   d1[2] = _mm_set_epi32( s[45], s[41], s[37], s[33] );
-   d2[2] = _mm_set_epi32( s[46], s[42], s[38], s[34] );
-   d3[2] = _mm_set_epi32( s[47], s[43], s[39], s[35] );
-
-   d0[3] = _mm_set_epi32( s[60], s[56], s[52], s[48] );
-   d1[3] = _mm_set_epi32( s[61], s[57], s[53], s[49] );
-   d2[3] = _mm_set_epi32( s[62], s[58], s[54], s[50] );
-   d3[3] = _mm_set_epi32( s[63], s[59], s[55], s[51] );
-
-   if ( bit_len <= 512 ) return;
-
-   d0[4] = _mm_set_epi32( s[76], s[72], s[68], s[64] );
-   d1[4] = _mm_set_epi32( s[77], s[73], s[69], s[65] );
-   d2[4] = _mm_set_epi32( s[78], s[74], s[70], s[66] );
-   d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
-
-   if ( bit_len <= 640 ) return;
-
-   d0[5] = _mm_set_epi32( s[92], s[88], s[84], s[80] );
-   d1[5] = _mm_set_epi32( s[93], s[89], s[85], s[81] );
-   d2[5] = _mm_set_epi32( s[94], s[90], s[86], s[82] );
-   d3[5] = _mm_set_epi32( s[95], s[91], s[87], s[83] );
-
-   d0[6] = _mm_set_epi32( s[108], s[104], s[100], s[ 96] );
-   d1[6] = _mm_set_epi32( s[109], s[105], s[101], s[ 97] );
-   d2[6] = _mm_set_epi32( s[110], s[106], s[102], s[ 98] );
-   d3[6] = _mm_set_epi32( s[111], s[107], s[103], s[ 99] );
-
-   d0[7] = _mm_set_epi32( s[124], s[120], s[116], s[112] );
-   d1[7] = _mm_set_epi32( s[125], s[121], s[117], s[113] );
-   d2[7] = _mm_set_epi32( s[126], s[122], s[118], s[114] );
-   d3[7] = _mm_set_epi32( s[127], s[123], s[119], s[115] );
-   // bit_len == 1024
-}
-
-// deinterleave 4 arrays into individual buffers for scalar processing
-// bit_len must be multiple of 32
-static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
-                                   void *dst3, const void *src, int bit_len )
-{
-   uint32_t *s  = (uint32_t*)src;
-   uint32_t *d0 = (uint32_t*)dst0;
-   uint32_t *d1 = (uint32_t*)dst1;
-   uint32_t *d2 = (uint32_t*)dst2;
-   uint32_t *d3 = (uint32_t*)dst3;
-
-   for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
-   {
-      *(d0+i) = *s;
-      *(d1+i) = *(s+1);
-      *(d2+i) = *(s+2);
-      *(d3+i) = *(s+3);
-   }
-}
-#if defined (__AVX2__)
-
-// Interleave 4 source buffers containing 64 bit data into the destination
-// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
-static inline void mm256_interleave_4x64( void *dst, const void *src0,
-           const void *src1, const void *src2, const void *src3, int bit_len )
-{
-   __m256i  *d  = (__m256i*)dst;
-   uint64_t *s0 = (uint64_t*)src0;
-   uint64_t *s1 = (uint64_t*)src1;
-   uint64_t *s2 = (uint64_t*)src2;
-   uint64_t *s3 = (uint64_t*)src3;
-
-   d[0] = _mm256_set_epi64x( s3[0], s2[0], s1[0], s0[0] );
-   d[1] = _mm256_set_epi64x( s3[1], s2[1], s1[1], s0[1] );
-   d[2] = _mm256_set_epi64x( s3[2], s2[2], s1[2], s0[2] );
-   d[3] = _mm256_set_epi64x( s3[3], s2[3], s1[3], s0[3] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[4] = _mm256_set_epi64x( s3[4], s2[4], s1[4], s0[4] );
-   d[5] = _mm256_set_epi64x( s3[5], s2[5], s1[5], s0[5] );
-   d[6] = _mm256_set_epi64x( s3[6], s2[6], s1[6], s0[6] );
-   d[7] = _mm256_set_epi64x( s3[7], s2[7], s1[7], s0[7] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] );
-   d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] );
-   d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] );
-
-   d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] );
-   d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] );
-   d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] );
-   d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] );
-   // bit_len == 1024
-}
-
-// Slower version
-// bit_len must be multiple of 64
-static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
-                                          void *src2, void *src3, int bit_len )
-{
-   uint64_t *d  = (uint64_t*)dst;
-   uint64_t *s0 = (uint64_t*)src0;
-   uint64_t *s1 = (uint64_t*)src1;
-   uint64_t *s2 = (uint64_t*)src2;
-   uint64_t *s3 = (uint64_t*)src3;
-
-   for ( int i = 0; i < bit_len>>6; i++, d += 4 )
-   {
-      *d     = *(s0+i);
-      *(d+1) = *(s1+i);
-      *(d+2) = *(s2+i);
-      *(d+3) = *(s3+i);
-   }
-}
-// Deinterleave 4 buffers of 64 bit data from the source buffer.
-// bit_len must be 256, 512, 640 or 1024 bits.
-// Requires overrun padding for 640 bit len.
-static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
-                                    void *dst3, const void *src, int bit_len )
-{
-   __m256i  *d0 = (__m256i*)dst0;
-   __m256i  *d1 = (__m256i*)dst1;
-   __m256i  *d2 = (__m256i*)dst2;
-   __m256i  *d3 = (__m256i*)dst3;
-   uint64_t *s  = (uint64_t*)src;
-
-   d0[0] = _mm256_set_epi64x( s[12], s[ 8], s[ 4], s[ 0] );
-   d1[0] = _mm256_set_epi64x( s[13], s[ 9], s[ 5], s[ 1] );
-   d2[0] = _mm256_set_epi64x( s[14], s[10], s[ 6], s[ 2] );
-   d3[0] = _mm256_set_epi64x( s[15], s[11], s[ 7], s[ 3] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[1] = _mm256_set_epi64x( s[28], s[24], s[20], s[16] );
-   d1[1] = _mm256_set_epi64x( s[29], s[25], s[21], s[17] );
-   d2[1] = _mm256_set_epi64x( s[30], s[26], s[22], s[18] );
-   d3[1] = _mm256_set_epi64x( s[31], s[27], s[23], s[19] );
-
-   if ( bit_len <= 512 ) return;
-
-   if ( bit_len <= 640 )
-   {
-      // null change to overrun area
-      d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
-      d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
-      d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
-      d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
-      return;
-   }
-
-   d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] );
-   d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] );
-   d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] );
-   d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] );
-
-   d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] );
-   d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] );
-   d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] );
-   d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] );
-   // bit_len == 1024
-}
-
-// Slower version
-// bit_len must be multiple of 64
-static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1,
-                             void *dst2, void *dst3, void *src, int bit_len )
-{
-   uint64_t *s  = (uint64_t*)src;
-   uint64_t *d0 = (uint64_t*)dst0;
-   uint64_t *d1 = (uint64_t*)dst1;
-   uint64_t *d2 = (uint64_t*)dst2;
-   uint64_t *d3 = (uint64_t*)dst3;
-
-   for ( int i = 0; i < bit_len>>6; i++, s += 4 )
-   {
-      *(d0+i) = *s;
-      *(d1+i) = *(s+1);
-      *(d2+i) = *(s+2);
-      *(d3+i) = *(s+3);
-   }
-}
-// Interleave 8 source buffers containing 32 bit data into the destination
-// vector
-static inline void mm256_interleave_8x32( void *dst, const void *src0,
-     const void *src1, const void *src2, const void *src3, const void *src4,
-     const void *src5, const void *src6, const void *src7, int bit_len )
-{
-   uint32_t *s0 = (uint32_t*)src0;
-   uint32_t *s1 = (uint32_t*)src1;
-   uint32_t *s2 = (uint32_t*)src2;
-   uint32_t *s3 = (uint32_t*)src3;
-   uint32_t *s4 = (uint32_t*)src4;
-   uint32_t *s5 = (uint32_t*)src5;
-   uint32_t *s6 = (uint32_t*)src6;
-   uint32_t *s7 = (uint32_t*)src7;
-   __m256i  *d  = (__m256i*)dst;
-
-   d[ 0] = _mm256_set_epi32( s7[0], s6[0], s5[0], s4[0],
-                             s3[0], s2[0], s1[0], s0[0] );
-   d[ 1] = _mm256_set_epi32( s7[1], s6[1], s5[1], s4[1],
-                             s3[1], s2[1], s1[1], s0[1] );
-   d[ 2] = _mm256_set_epi32( s7[2], s6[2], s5[2], s4[2],
-                             s3[2], s2[2], s1[2], s0[2] );
-   d[ 3] = _mm256_set_epi32( s7[3], s6[3], s5[3], s4[3],
-                             s3[3], s2[3], s1[3], s0[3] );
-   d[ 4] = _mm256_set_epi32( s7[4], s6[4], s5[4], s4[4],
-                             s3[4], s2[4], s1[4], s0[4] );
-   d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5],
-                             s3[5], s2[5], s1[5], s0[5] );
-   d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
-                             s3[6], s2[6], s1[6], s0[6] );
-   d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
-                             s3[7], s2[7], s1[7], s0[7] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[ 8] = _mm256_set_epi32( s7[ 8], s6[ 8], s5[ 8], s4[ 8],
-                             s3[ 8], s2[ 8], s1[ 8], s0[ 8] );
-   d[ 9] = _mm256_set_epi32( s7[ 9], s6[ 9], s5[ 9], s4[ 9],
-                             s3[ 9], s2[ 9], s1[ 9], s0[ 9] );
-   d[10] = _mm256_set_epi32( s7[10], s6[10], s5[10], s4[10],
-                             s3[10], s2[10], s1[10], s0[10] );
-   d[11] = _mm256_set_epi32( s7[11], s6[11], s5[11], s4[11],
-                             s3[11], s2[11], s1[11], s0[11] );
-   d[12] = _mm256_set_epi32( s7[12], s6[12], s5[12], s4[12],
-                             s3[12], s2[12], s1[12], s0[12] );
-   d[13] = _mm256_set_epi32( s7[13], s6[13], s5[13], s4[13],
-                             s3[13], s2[13], s1[13], s0[13] );
-   d[14] = _mm256_set_epi32( s7[14], s6[14], s5[14], s4[14],
-                             s3[14], s2[14], s1[14], s0[14] );
-   d[15] = _mm256_set_epi32( s7[15], s6[15], s5[15], s4[15],
-                             s3[15], s2[15], s1[15], s0[15] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[16] = _mm256_set_epi32( s7[16], s6[16], s5[16], s4[16],
-                             s3[16], s2[16], s1[16], s0[16] );
-   d[17] = _mm256_set_epi32( s7[17], s6[17], s5[17], s4[17],
-                             s3[17], s2[17], s1[17], s0[17] );
-   d[18] = _mm256_set_epi32( s7[18], s6[18], s5[18], s4[18],
-                             s3[18], s2[18], s1[18], s0[18] );
-   d[19] = _mm256_set_epi32( s7[19], s6[19], s5[19], s4[19],
-                             s3[19], s2[19], s1[19], s0[19] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[20] = _mm256_set_epi32( s7[20], s6[20], s5[20], s4[20],
-                             s3[20], s2[20], s1[20], s0[20] );
-   d[21] = _mm256_set_epi32( s7[21], s6[21], s5[21], s4[21],
-                             s3[21], s2[21], s1[21], s0[21] );
-   d[22] = _mm256_set_epi32( s7[22], s6[22], s5[22], s4[22],
-                             s3[22], s2[22], s1[22], s0[22] );
-   d[23] = _mm256_set_epi32( s7[23], s6[23], s5[23], s4[23],
-                             s3[23], s2[23], s1[23], s0[23] );
-
-   if ( bit_len <= 768 ) return;
-
-   d[24] = _mm256_set_epi32( s7[24], s6[24], s5[24], s4[24],
-                             s3[24], s2[24], s1[24], s0[24] );
-   d[25] = _mm256_set_epi32( s7[25], s6[25], s5[25], s4[25],
-                             s3[25], s2[25], s1[25], s0[25] );
-   d[26] = _mm256_set_epi32( s7[26], s6[26], s5[26], s4[26],
-                             s3[26], s2[26], s1[26], s0[26] );
-   d[27] = _mm256_set_epi32( s7[27], s6[27], s5[27], s4[27],
-                             s3[27], s2[27], s1[27], s0[27] );
-   d[28] = _mm256_set_epi32( s7[28], s6[28], s5[28], s4[28],
-                             s3[28], s2[28], s1[28], s0[28] );
-   d[29] = _mm256_set_epi32( s7[29], s6[29], s5[29], s4[29],
-                             s3[29], s2[29], s1[29], s0[29] );
-   d[30] = _mm256_set_epi32( s7[30], s6[30], s5[30], s4[30],
-                             s3[30], s2[30], s1[30], s0[30] );
-   d[31] = _mm256_set_epi32( s7[31], s6[31], s5[31], s4[31],
-                             s3[31], s2[31], s1[31], s0[31] );
-
-   // bit_len == 1024
-}
-
-// Slower but it works with 32 bit data
-// bit_len must be multiple of 32
-static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
-      uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
-      uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
-{
-   uint32_t *d = dst;
-   for ( int i = 0; i < bit_len>>5; i++, d += 8 )
-   {
-      *d     = *(src0+i);
-      *(d+1) = *(src1+i);
-      *(d+2) = *(src2+i);
-      *(d+3) = *(src3+i);
-      *(d+4) = *(src4+i);
-      *(d+5) = *(src5+i);
-      *(d+6) = *(src6+i);
-      *(d+7) = *(src7+i);
-   }
-}
-// Deinterleave 8 buffers of 32 bit data from the source buffer.
-static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
-                   void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
-                   const void *src, int bit_len )
-{
-   uint32_t *s  = (uint32_t*)src;
-   __m256i  *d0 = (__m256i*)dst0;
-   __m256i  *d1 = (__m256i*)dst1;
-   __m256i  *d2 = (__m256i*)dst2;
-   __m256i  *d3 = (__m256i*)dst3;
-   __m256i  *d4 = (__m256i*)dst4;
-   __m256i  *d5 = (__m256i*)dst5;
-   __m256i  *d6 = (__m256i*)dst6;
-   __m256i  *d7 = (__m256i*)dst7;
-
-   d0[0] = _mm256_set_epi32( s[ 56], s[ 48], s[ 40], s[ 32],
-                             s[ 24], s[ 16], s[  8], s[  0] );
-   d1[0] = _mm256_set_epi32( s[ 57], s[ 49], s[ 41], s[ 33],
-                             s[ 25], s[ 17], s[  9], s[  1] );
-   d2[0] = _mm256_set_epi32( s[ 58], s[ 50], s[ 42], s[ 34],
-                             s[ 26], s[ 18], s[ 10], s[  2] );
-   d3[0] = _mm256_set_epi32( s[ 59], s[ 51], s[ 43], s[ 35],
-                             s[ 27], s[ 19], s[ 11], s[  3] );
-   d4[0] = _mm256_set_epi32( s[ 60], s[ 52], s[ 44], s[ 36],
-                             s[ 28], s[ 20], s[ 12], s[  4] );
-   d5[0] = _mm256_set_epi32( s[ 61], s[ 53], s[ 45], s[ 37],
-                             s[ 29], s[ 21], s[ 13], s[  5] );
-   d6[0] = _mm256_set_epi32( s[ 62], s[ 54], s[ 46], s[ 38],
-                             s[ 30], s[ 22], s[ 14], s[  6] );
-   d7[0] = _mm256_set_epi32( s[ 63], s[ 55], s[ 47], s[ 39],
-                             s[ 31], s[ 23], s[ 15], s[  7] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[1] = _mm256_set_epi32( s[120], s[112], s[104], s[ 96],
-                             s[ 88], s[ 80], s[ 72], s[ 64] );
-   d1[1] = _mm256_set_epi32( s[121], s[113], s[105], s[ 97],
-                             s[ 89], s[ 81], s[ 73], s[ 65] );
-   d2[1] = _mm256_set_epi32( s[122], s[114], s[106], s[ 98],
-                             s[ 90], s[ 82], s[ 74], s[ 66] );
-   d3[1] = _mm256_set_epi32( s[123], s[115], s[107], s[ 99],
-                             s[ 91], s[ 83], s[ 75], s[ 67] );
-   d4[1] = _mm256_set_epi32( s[124], s[116], s[108], s[100],
-                             s[ 92], s[ 84], s[ 76], s[ 68] );
-   d5[1] = _mm256_set_epi32( s[125], s[117], s[109], s[101],
-                             s[ 93], s[ 85], s[ 77], s[ 69] );
-   d6[1] = _mm256_set_epi32( s[126], s[118], s[110], s[102],
-                             s[ 94], s[ 86], s[ 78], s[ 70] );
-   d7[1] = _mm256_set_epi32( s[127], s[119], s[111], s[103],
-                             s[ 95], s[ 87], s[ 79], s[ 71] );
-
-   if ( bit_len <= 512 ) return;
-
-   // null change for overrun space, vector indexing doesn't work for
-   // 32 bit data
-   if ( bit_len <= 640 )
-   {
-      uint32_t *d = ((uint32_t*)d0) + 8;
-      d0[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[152], s[144], s[136], s[128] );
-      d = ((uint32_t*)d1) + 8;
-      d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[153], s[145], s[137], s[129] );
-      d = ((uint32_t*)d2) + 8;
-      d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[154], s[146], s[138], s[130] );
-      d = ((uint32_t*)d3) + 8;
-      d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[155], s[147], s[139], s[131] );
-      d = ((uint32_t*)d4) + 8;
-      d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[156], s[148], s[140], s[132] );
-      d = ((uint32_t*)d5) + 8;
-      d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[157], s[149], s[141], s[133] );
-      d = ((uint32_t*)d6) + 8;
-      d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[158], s[150], s[142], s[134] );
-      d = ((uint32_t*)d7) + 8;
-      d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                                s[159], s[151], s[143], s[135] );
-      return;
-   }
-
-   d0[2] = _mm256_set_epi32( s[184], s[176], s[168], s[160],
-                             s[152], s[144], s[136], s[128] );
-   d1[2] = _mm256_set_epi32( s[185], s[177], s[169], s[161],
-                             s[153], s[145], s[137], s[129] );
-   d2[2] = _mm256_set_epi32( s[186], s[178], s[170], s[162],
-                             s[154], s[146], s[138], s[130] );
-   d3[2] = _mm256_set_epi32( s[187], s[179], s[171], s[163],
-                             s[155], s[147], s[139], s[131] );
-   d4[2] = _mm256_set_epi32( s[188], s[180], s[172], s[164],
-                             s[156], s[148], s[140], s[132] );
-   d5[2] = _mm256_set_epi32( s[189], s[181], s[173], s[165],
-                             s[157], s[149], s[141], s[133] );
-   d6[2] = _mm256_set_epi32( s[190], s[182], s[174], s[166],
-                             s[158], s[150], s[142], s[134] );
-   d7[2] = _mm256_set_epi32( s[191], s[183], s[175], s[167],
-                             s[159], s[151], s[143], s[135] );
-
-   if ( bit_len <= 768 ) return;
-
-   d0[3] = _mm256_set_epi32( s[248], s[240], s[232], s[224],
-                             s[216], s[208], s[200], s[192] );
-   d1[3] = _mm256_set_epi32( s[249], s[241], s[233], s[225],
-                             s[217], s[209], s[201], s[193] );
-   d2[3] = _mm256_set_epi32( s[250], s[242], s[234], s[226],
-                             s[218], s[210], s[202], s[194] );
-   d3[3] = _mm256_set_epi32( s[251], s[243], s[235], s[227],
-                             s[219], s[211], s[203], s[195] );
-   d4[3] = _mm256_set_epi32( s[252], s[244], s[236], s[228],
-                             s[220], s[212], s[204], s[196] );
-   d5[3] = _mm256_set_epi32( s[253], s[245], s[237], s[229],
-                             s[221], s[213], s[205], s[197] );
-   d6[3] = _mm256_set_epi32( s[254], s[246], s[238], s[230],
-                             s[222], s[214], s[206], s[198] );
-   d7[3] = _mm256_set_epi32( s[255], s[247], s[239], s[231],
-                             s[223], s[215], s[207], s[199] );
-   // bit_len == 1024
-}
-
-// Deinterleave 8 arrays into individual buffers for scalar processing
-// bit_len must be multiple of 32
-static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
-                uint32_t *dst2, uint32_t *dst3, uint32_t *dst4,
-                uint32_t *dst5, uint32_t *dst6, uint32_t *dst7,
-                uint32_t *src, int bit_len )
-{
-   uint32_t *s = src;
-   for ( int i = 0; i < bit_len>>5; i++, s += 8 )
-   {
-      *(dst0+i) = *( s     );
-      *(dst1+i) = *( s + 1 );
-      *(dst2+i) = *( s + 2 );
-      *(dst3+i) = *( s + 3 );
-      *(dst4+i) = *( s + 4 );
-      *(dst5+i) = *( s + 5 );
-      *(dst6+i) = *( s + 6 );
-      *(dst7+i) = *( s + 7 );
-   }
-}
-// Convert from 4x32 AVX interleaving to 4x64 AVX2.
-// Can't do it in place.
-static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
-{
-   __m256i  *d = (__m256i*)dst;
-   uint32_t *s = (uint32_t*)src;
-
-   d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
-   d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
-   d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
-   d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
-   d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
-   d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
-   d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
-   d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[10] = _mm256_set_epi32( s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80] );
-   d[11] = _mm256_set_epi32( s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88] );
-
-   d[12] = _mm256_set_epi32( s[103],s[ 99],s[102],s[ 98],s[101],s[ 97],s[100],s[ 96] );
-   d[13] = _mm256_set_epi32( s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104] );
-   d[14] = _mm256_set_epi32( s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112] );
-   d[15] = _mm256_set_epi32( s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120] );
-   // bit_len == 1024
-}
-
-// likely of no use.
-// convert 4x32 (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
-// bit_len must be multiple of 64
-// broken
-static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
-                                             int bit_len )
-{
-   uint32_t *d = (uint32_t*)dst;
-   uint32_t *s = (uint32_t*)src;
-   for ( int i = 0; i < bit_len >> 5; i += 8 )
-   {
-      *( d + i     ) = *( s + i     );   // 0 <- 0    8 <- 8
-      *( d + i + 1 ) = *( s + i + 4 );   // 1 <- 4    9 <- 12
-      *( d + i + 2 ) = *( s + i + 1 );   // 2 <- 1   10 <- 9
-      *( d + i + 3 ) = *( s + i + 5 );   // 3 <- 5   11 <- 13
-      *( d + i + 4 ) = *( s + i + 2 );   // 4 <- 2   12 <- 10
-      *( d + i + 5 ) = *( s + i + 6 );   // 5 <- 6   13 <- 14
-      *( d + i + 6 ) = *( s + i + 3 );   // 6 <- 3   14 <- 11
-      *( d + i + 7 ) = *( s + i + 7 );   // 7 <- 7   15 <- 15
-   }
-}
-// Convert 4x64 (256 bit) vectors to 4x32 (128 bit) vectors for AVX
-// bit_len must be multiple of 64
-static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
-{
-   __m256i  *d = (__m256i*)dst;
-   uint32_t *s = (uint32_t*)src;
-
-   d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
-   d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
-   d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
-   d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
-   d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
-   d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
-   d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
-   d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
-   d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
-
-   d[12] = _mm256_set_epi32( s[103],s[101],s[ 99],s[ 97],s[102],s[100],s[ 98],s[ 96] );
-   d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
-   d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
-   d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
-   // bit_len == 1024
-}
-static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1,
-                                           int bit_len )
-{
-   __m256i  *d  = (__m256i*)dst;
-   uint64_t *s0 = (uint64_t*)src0;
-   uint64_t *s1 = (uint64_t*)src1;
-
-   d[0] = _mm256_set_epi64x( s1[ 1], s1[ 0], s0[ 1], s0[ 0] );
-   d[1] = _mm256_set_epi64x( s1[ 3], s1[ 2], s0[ 3], s0[ 2] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[2] = _mm256_set_epi64x( s1[ 5], s1[ 4], s0[ 5], s0[ 4] );
-   d[3] = _mm256_set_epi64x( s1[ 7], s1[ 6], s0[ 7], s0[ 6] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[4] = _mm256_set_epi64x( s1[ 9], s1[ 8], s0[ 9], s0[ 8] );
-
-   if ( bit_len <= 640 ) return;
-
-   d[5] = _mm256_set_epi64x( s1[11], s1[10], s0[11], s0[10] );
-
-   d[6] = _mm256_set_epi64x( s1[13], s1[12], s0[13], s0[12] );
-   d[7] = _mm256_set_epi64x( s1[15], s1[14], s0[15], s0[14] );
-
-   // bit_len == 1024
-}
-
-static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src,
-                                             int bit_len )
-{
-   uint64_t *s  = (uint64_t*)src;
-   __m256i  *d0 = (__m256i*)dst0;
-   __m256i  *d1 = (__m256i*)dst1;
-
-   d0[0] = _mm256_set_epi64x( s[ 5], s[ 4], s[ 1], s[ 0] );
-   d1[0] = _mm256_set_epi64x( s[ 7], s[ 6], s[ 3], s[ 2] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[1] = _mm256_set_epi64x( s[13], s[12], s[ 9], s[ 8] );
-   d1[1] = _mm256_set_epi64x( s[15], s[14], s[11], s[10] );
-
-   if ( bit_len <= 512 ) return;
-
-   if ( bit_len <= 640 )
-   {
-      d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[17], s[16] );
-      d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[19], s[18] );
-      return;
-   }
-
-   d0[2] = _mm256_set_epi64x( s[21], s[20], s[17], s[16] );
-   d1[2] = _mm256_set_epi64x( s[23], s[22], s[19], s[18] );
-
-   d0[3] = _mm256_set_epi64x( s[29], s[28], s[25], s[24] );
-   d1[3] = _mm256_set_epi64x( s[31], s[30], s[27], s[26] );
-
-   // bit_len == 1024
-}
-// not used
-static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
-{
-   uint32_t *d = (uint32_t*)dst;
-   uint32_t *s = (uint32_t*)src;
-   for ( int i = 0; i < bit_len >> 5; i += 8 )
-   {
-      *( d + i     ) = *( s + i     );
-      *( d + i + 1 ) = *( s + i + 2 );
-      *( d + i + 2 ) = *( s + i + 4 );
-      *( d + i + 3 ) = *( s + i + 6 );
-      *( d + i + 4 ) = *( s + i + 1 );
-      *( d + i + 5 ) = *( s + i + 3 );
-      *( d + i + 6 ) = *( s + i + 5 );
-      *( d + i + 7 ) = *( s + i + 7 );
-   }
-}
-
-#endif   // __AVX2__

 #endif   // AVXDEFS_H__
configure (vendored, 20 lines changed)
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.7.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.8.6.1'
-PACKAGE_STRING='cpuminer-opt 3.8.6.1'
+PACKAGE_VERSION='3.8.7'
+PACKAGE_STRING='cpuminer-opt 3.8.7'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.8.6.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.8.7 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.8.6.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.8.7:";;
   esac
   cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.8.6.1
+cpuminer-opt configure 3.8.7
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.8.6.1, which was
+It was created by cpuminer-opt $as_me 3.8.7, which was
 generated by GNU Autoconf 2.69. Invocation command line was

   $ $0 $@
@@ -2981,7 +2981,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
-VERSION='3.8.6.1'
+VERSION='3.8.7'


 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.8.6.1, which was
+This file was extended by cpuminer-opt $as_me 3.8.7, which was
 generated by GNU Autoconf 2.69. Invocation command line was

   CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.8.6.1
+cpuminer-opt config.status 3.8.7
 configured by $0, generated by GNU Autoconf 2.69,
 with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.8.6.1])
+AC_INIT([cpuminer-opt], [3.8.7])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
cpu-miner.c (118 lines changed)
@@ -2999,60 +2999,47 @@ static void show_credits()
 bool check_cpu_capability ()
 {
      char cpu_brand[0x40];
-     // there is no CPU related feature specific to 4way, just AVX2 and AES
-     bool cpu_has_sse2   = has_sse2();
-     bool cpu_has_aes    = has_aes_ni();
-     bool cpu_has_sse42  = has_sse42();
-     bool cpu_has_avx    = has_avx1();
-     bool cpu_has_avx2   = has_avx2();
-     bool cpu_has_sha    = has_sha();
-     // no need to check if sw has sse2,
-     // the code won't compile without it.
-     // bool sw_has_sse2   = false;
-     bool sw_has_aes    = false;
-     bool sw_has_sse42  = false;
-     bool sw_has_avx    = false;
-     bool sw_has_avx2   = false;
-     bool sw_has_sha    = false;
-     // bool sw_has_4way   = false;
+     bool cpu_has_sse2   = has_sse2();
+     bool cpu_has_aes    = has_aes_ni();
+     bool cpu_has_sse42  = has_sse42();
+     bool cpu_has_avx2   = has_avx2();
+     bool cpu_has_sha    = has_sha();
+     bool cpu_has_avx512 = has_avx512f();
+     bool sw_has_aes    = false;
+     bool sw_has_sse42  = false;
+     bool sw_has_avx2   = false;
+     bool sw_has_avx512 = false;
+     bool sw_has_sha    = false;
      set_t algo_features = algo_gate.optimizations;
-     bool algo_has_sse2   = set_incl( SSE2_OPT,  algo_features );
-     bool algo_has_aes    = set_incl( AES_OPT,   algo_features );
-     bool algo_has_sse42  = set_incl( SSE42_OPT, algo_features );
-     bool algo_has_avx    = set_incl( AVX_OPT,   algo_features );
-     bool algo_has_avx2   = set_incl( AVX2_OPT,  algo_features );
-     bool algo_has_sha    = set_incl( SHA_OPT,   algo_features );
-     // bool algo_has_4way   = set_incl( FOUR_WAY_OPT, algo_features );
+     bool algo_has_sse2   = set_incl( SSE2_OPT,   algo_features );
+     bool algo_has_aes    = set_incl( AES_OPT,    algo_features );
+     bool algo_has_sse42  = set_incl( SSE42_OPT,  algo_features );
+     bool algo_has_avx2   = set_incl( AVX2_OPT,   algo_features );
+     bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
+     bool algo_has_sha    = set_incl( SHA_OPT,    algo_features );
      bool use_aes;
      bool use_sse2;
      bool use_sse42;
-     bool use_avx;
      bool use_avx2;
+     bool use_avx512;
      bool use_sha;
-     // bool use_4way;
      bool use_none;

 #ifdef __AES__
      sw_has_aes = true;
 #endif
-// #ifdef __SSE2__
-//     sw_has_sse2 = true;
-// #endif
 #ifdef __SSE4_2__
      sw_has_sse42 = true;
 #endif
-#ifdef __AVX__
-     sw_has_avx = true;
-#endif
 #ifdef __AVX2__
      sw_has_avx2 = true;
 #endif
+#ifdef __AVX512F__
+     sw_has_avx512 = true;
+#endif
 #ifdef __SHA__
      sw_has_sha = true;
 #endif
-// #ifdef HASH_4WAY
-//     sw_has_4way = true;
-// #endif

 #if !((__AES__) || (__SSE2__))
      printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3072,33 +3059,31 @@ bool check_cpu_capability ()
 #endif

      printf("CPU features:");
-     if ( cpu_has_sse2 )   printf( " SSE2" );
-     if ( cpu_has_aes )    printf( " AES" );
-     if ( cpu_has_sse42 )  printf( " SSE4.2" );
-     if ( cpu_has_avx )    printf( " AVX" );
-     if ( cpu_has_avx2 )   printf( " AVX2" );
-     if ( cpu_has_sha )    printf( " SHA" );
+     if ( cpu_has_sse2 )   printf( " SSE2" );
+     if ( cpu_has_aes )    printf( " AES" );
+     if ( cpu_has_sse42 )  printf( " SSE4.2" );
+     if ( cpu_has_avx2 )   printf( " AVX2" );
+     if ( cpu_has_avx512 ) printf( " AVX512" );
+     if ( cpu_has_sha )    printf( " SHA" );

      printf(".\nSW features: SSE2");
-     if ( sw_has_aes )    printf( " AES" );
-     if ( sw_has_sse42 )  printf( " SSE4.2" );
-     if ( sw_has_avx )    printf( " AVX" );
-     if ( sw_has_avx2 )   printf( " AVX2" );
-//   if ( sw_has_4way )   printf( " 4WAY" );
-     if ( sw_has_sha )    printf( " SHA" );
+     if ( sw_has_aes )    printf( " AES" );
+     if ( sw_has_sse42 )  printf( " SSE4.2" );
+     if ( sw_has_avx2 )   printf( " AVX2" );
+     if ( sw_has_avx512 ) printf( " AVX512" );
+     if ( sw_has_sha )    printf( " SHA" );

      printf(".\nAlgo features:");
      if ( algo_features == EMPTY_SET ) printf( " None" );
      else
      {
-        if ( algo_has_sse2 )   printf( " SSE2" );
-        if ( algo_has_aes )    printf( " AES" );
-        if ( algo_has_sse42 )  printf( " SSE4.2" );
-        if ( algo_has_avx )    printf( " AVX" );
-        if ( algo_has_avx2 )   printf( " AVX2" );
-//      if ( algo_has_4way )   printf( " 4WAY" );
-        if ( algo_has_sha )    printf( " SHA" );
+        if ( algo_has_sse2 )   printf( " SSE2" );
+        if ( algo_has_aes )    printf( " AES" );
+        if ( algo_has_sse42 )  printf( " SSE4.2" );
+        if ( algo_has_avx2 )   printf( " AVX2" );
+        if ( algo_has_avx512 ) printf( " AVX512" );
+        if ( algo_has_sha )    printf( " SHA" );
      }
      printf(".\n");

@@ -3118,11 +3103,6 @@ bool check_cpu_capability ()
         printf( "The SW build requires a CPU with SSE4.2!\n" );
         return false;
      }
-     if ( sw_has_avx && !cpu_has_avx )
-     {
-        printf( "The SW build requires a CPU with AVX!\n" );
-        return false;
-     }
      if ( sw_has_aes && !cpu_has_aes )
      {
         printf( "The SW build requires a CPU with AES!\n" );
@@ -3135,13 +3115,13 @@ bool check_cpu_capability ()
      }

      // Determine mining options
-     use_sse2   = cpu_has_sse2  && algo_has_sse2;
-     use_aes    = cpu_has_aes   && sw_has_aes   && algo_has_aes;
+     use_sse2   = cpu_has_sse2  && algo_has_sse2;
+     use_aes    = cpu_has_aes   && sw_has_aes   && algo_has_aes;
      use_sse42  = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
-     use_avx    = cpu_has_avx   && sw_has_avx   && algo_has_avx;
-     use_avx2   = cpu_has_avx2  && sw_has_avx2  && algo_has_avx2;
-     use_sha    = cpu_has_sha   && sw_has_sha   && algo_has_sha;
-     use_none   = !( use_sse2 || use_aes || use_sse42 || use_avx || use_avx2 ||
+     use_avx2   = cpu_has_avx2   && sw_has_avx2   && algo_has_avx2;
+     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
+     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
+     use_none   = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
                      use_sha );

      // Display best options
@@ -3149,12 +3129,12 @@ bool check_cpu_capability ()
      if ( use_none ) printf( " no optimizations" );
      else
      {
-        if      ( use_aes )    printf( " AES" );
-        if      ( use_avx2 )   printf( " AVX2" );
-        else if ( use_avx )    printf( " AVX" );
+        if      ( use_aes )    printf( " AES" );
+        if      ( use_avx512 ) printf( " AVX512" );
+        else if ( use_avx2 )   printf( " AVX2" );
         else if ( use_sse42 )  printf( " SSE4.2" );
-        else if ( use_sse2 )   printf( " SSE2" );
-        if ( use_sha )         printf( " SHA" );
+        else if ( use_sse2 )   printf( " SSE2" );
+        if ( use_sha )         printf( " SHA" );
      }
      printf( ".\n\n" );
interleave.h (new file, 1372 lines): diff suppressed because it is too large.
miner.h (16 lines changed)
@@ -333,6 +333,7 @@ bool has_sha();
 bool has_aes_ni();
 bool has_avx1();
 bool has_avx2();
+bool has_avx512f();
 bool has_sse2();
 bool has_xop();
 bool has_fma3();
@@ -485,8 +486,9 @@ enum algos {
   ALGO_ALLIUM,
   ALGO_ANIME,
   ALGO_ARGON2,
-  ALGO_ARGON2DCRDS,
-  ALGO_ARGON2DDYN,
+  ALGO_ARGON2D250,
+  ALGO_ARGON2D500,
+  ALGO_ARGON2D4096,
   ALGO_AXIOM,
   ALGO_BASTION,
   ALGO_BLAKE,
@@ -565,8 +567,9 @@ static const char* const algo_names[] = {
   "allium",
   "anime",
   "argon2",
-  "argon2d-crds",
-  "argon2d-dyn",
+  "argon2d250",
+  "argon2d500",
+  "argon2d4096",
   "axiom",
   "bastion",
   "blake",
@@ -704,8 +707,9 @@ Options:\n\
                           allium        Garlicoin (GRLC)\n\
                           anime         Animecoin (ANI)\n\
                           argon2        Argon2 Coin (AR2)\n\
-                          argon2d-crds  Credits (CRDS)\n\
-                          argon2d-dyn   Dynamic (DYN)\n\
+                          argon2d250    argon2d-crds, Credits (CRDS)\n\
+                          argon2d500    argon2d-dyn, Dynamic (DYN)\n\
+                          argon2d4096   argon2d-uis, Unitus (UIS)\n\
                           axiom         Shabal-256 MemoHash\n\
                           bastion\n\
                           blake         blake256r14 (SFR)\n\
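enum algos and algo_names[] are parallel tables, so the rename has to land at the same positions in both. A compile-time guard along these lines (not present in the project; ALGO_COUNT is assumed to be the enum's final sentinel member) would catch drift:

_Static_assert( sizeof(algo_names) / sizeof(algo_names[0]) == ALGO_COUNT,
                "algo_names[] out of sync with enum algos" );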
sysinfos.c (16 lines changed)
@@ -274,6 +274,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
 #define SSE2_Flag    (1<<26)

 #define AVX2_Flag    (1<< 5)   // ADV EBX
+#define AVX512F_Flag (1<<16)
 #define SHA_Flag     (1<<29)

 // Use this to detect presence of feature
@@ -350,6 +351,21 @@ static inline bool has_avx2_()

 bool has_avx2() { return has_avx2_(); }

+static inline bool has_avx512f_()
+{
+#ifdef __arm__
+   return false;
+#else
+   int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, cpu_info );
+   return cpu_info[ EBX_Reg ] & AVX512F_Flag;
+#endif
+}
+
+bool has_avx512f() { return has_avx512f_(); }
+
+
 // AMD only
 static inline bool has_xop_()
 {
 #ifdef __arm__
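One caveat worth noting: CPUID leaf 7 EBX bit 16 only says the CPU implements AVX-512F, not that the OS saves ZMM state. A more defensive runtime check, a sketch not part of this commit, also queries XCR0 via XGETBV (assuming a compiler that provides _xgetbv, and that OSXSAVE was already confirmed via CPUID leaf 1):

#include <immintrin.h>
#include <stdbool.h>
#include <stdint.h>

static inline bool os_supports_avx512()
{
   // XCR0 bits 1,2 (SSE, AVX) and 5,6,7 (opmask, ZMM0-15 hi256, ZMM16-31)
   // must all be enabled by the OS: mask 0xE6.
   uint64_t xcr0 = _xgetbv( 0 );
   return ( xcr0 & 0xE6 ) == 0xE6;
}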
@@ -46,12 +46,12 @@ mv cpuminer.exe release/cpuminer-avx2.exe
 #mv cpuminer.exe release/cpuminer-aes-sha.exe

-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
-make
-strip -s cpuminer.exe
-mv cpuminer.exe release/cpuminer-aes-avx.exe
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
+#make
+#strip -s cpuminer.exe
+#mv cpuminer.exe release/cpuminer-aes-avx.exe

 make clean || echo clean
 rm -f config.status