mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v24.3
This commit is contained in:
@@ -75,6 +75,13 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v24.3
|
||||
|
||||
ARM: CPU feature detection and reporting is now working.
|
||||
ARM: Verthash is now working.
|
||||
ARM: Small speedup for yescrypt, yespower & argon2d.
|
||||
Code cleanup.
|
||||
|
||||
v24.2
|
||||
|
||||
x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
|
||||
|
||||
@@ -98,7 +98,6 @@ typedef uint32_t set_t;
|
||||
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
||||
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
|
||||
#define VAES_OPT 1 << 8 // Icelake, Zen3
|
||||
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
|
||||
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
|
||||
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
|
||||
#define NEON_OPT 1 << 11 // AArch64
|
||||
|
||||
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
|
||||
{
|
||||
__m512i v[16], m[16];
|
||||
|
||||
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
}
|
||||
|
||||
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m512i* in =(__m512i*)input;
|
||||
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_8way_compress( ctx, 0 );
|
||||
blake2b_8x64_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
|
||||
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m512i( out, 0 ) = ctx->h[0];
|
||||
casti_m512i( out, 1 ) = ctx->h[1];
|
||||
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
|
||||
};
|
||||
*/
|
||||
|
||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
|
||||
{
|
||||
__m256i v[16], m[16];
|
||||
|
||||
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m256i* in =(__m256i*)input;
|
||||
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_4way_compress( ctx, 0 );
|
||||
blake2b_4x64_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
||||
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
|
||||
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m256i( out, 0 ) = ctx->h[0];
|
||||
casti_m256i( out, 1 ) = ctx->h[1];
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
||||
#define __BLAKE2B_HASH_4WAY_H__
|
||||
#ifndef BLAKE2B_HASH_4WAY_H__
|
||||
#define BLAKE2B_HASH_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stddef.h>
|
||||
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_8way_ctx;
|
||||
} blake2b_8x64_ctx;
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
|
||||
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
|
||||
|
||||
#define blake2b_8way_ctx blake2b_8x64_ctx
|
||||
#define blake2b_8way_init blake2b_8x64_init
|
||||
#define blake2b_8way_update blake2b_8x64_update
|
||||
#define blake2b_8way_final blake2b_8x64_final
|
||||
|
||||
#endif
|
||||
|
||||
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_4way_ctx;
|
||||
} blake2b_4x64_ctx;
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
|
||||
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
|
||||
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
|
||||
|
||||
#define blake2b_4way_ctx blake2b_4x64_ctx
|
||||
#define blake2b_4way_init blake2b_4x64_init
|
||||
#define blake2b_4way_update blake2b_4x64_update
|
||||
#define blake2b_4way_final blake2b_4x64_final
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
//#pragma once
|
||||
#ifndef __BLAKE2S_HASH_4WAY_H__
|
||||
#define __BLAKE2S_HASH_4WAY_H__ 1
|
||||
#ifndef BLAKE2S_HASH_4WAY_H__
|
||||
#define BLAKE2S_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
|
||||
init_myrgr_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad;
|
||||
gate->hash = (void*)&myriad_hash;
|
||||
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
|
||||
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
|
||||
|
||||
#endif // AVX2 else SSE2
|
||||
|
||||
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
return ( w >> c ) | ( w << ( 64 - c ) );
|
||||
}
|
||||
|
||||
#define G( r, i, a, b, c, d ) \
|
||||
{ \
|
||||
a = a + b; \
|
||||
|
||||
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
|
||||
applog( LOG_ERR, "M7M algo is not supported on MacOS");
|
||||
return false;
|
||||
#else
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = SHA256_OPT;
|
||||
init_m7m_ctx();
|
||||
gate->scanhash = (void*)&scanhash_m7m_hash;
|
||||
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
||||
|
||||
@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// we need bigendian data...
|
||||
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
|
||||
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
|
||||
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
|
||||
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
|
||||
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
|
||||
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
|
||||
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
|
||||
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
|
||||
|
||||
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// we need bigendian data...
|
||||
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
|
||||
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
|
||||
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
|
||||
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
|
||||
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
|
||||
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
|
||||
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
|
||||
intrlv_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 1024 );
|
||||
|
||||
|
||||
@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
|
||||
|
||||
bool register_lbry_algo( algo_gate_t* gate )
|
||||
{
|
||||
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
#if defined (LBRY_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_lbry_16way;
|
||||
gate->hash = (void*)&lbry_16way_hash;
|
||||
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_lbry;
|
||||
gate->hash = (void*)&lbry_hash;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
|
||||
#endif
|
||||
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
||||
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
||||
|
||||
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
|
||||
v128_ovly v;
|
||||
for ( int l = 0; l < 4; l++ )
|
||||
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
||||
X[i] = v128_xor( X[i], v.m128 );
|
||||
X[i] = v128_xor( X[i], v.v128 );
|
||||
}
|
||||
|
||||
xor_salsa8_4way( &X[ 0], &X[16] );
|
||||
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
||||
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
|
||||
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
|
||||
|
||||
y[0].m128 = X0;
|
||||
y[1].m128 = X1;
|
||||
y[2].m128 = X2;
|
||||
y[3].m128 = X3;
|
||||
y[0].v128 = X0;
|
||||
y[1].v128 = X1;
|
||||
y[2].v128 = X2;
|
||||
y[3].v128 = X3;
|
||||
|
||||
z[0].u32[0] = y[0].u32[0];
|
||||
z[0].u32[3] = y[1].u32[0];
|
||||
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
||||
z[3].u32[1] = y[2].u32[3];
|
||||
z[3].u32[0] = y[3].u32[3];
|
||||
|
||||
B[0] = v128_add32( B[0], z[0].m128 );
|
||||
B[1] = v128_add32( B[1], z[1].m128 );
|
||||
B[2] = v128_add32( B[2], z[2].m128 );
|
||||
B[3] = v128_add32( B[3], z[3].m128 );
|
||||
B[0] = v128_add32( B[0], z[0].v128 );
|
||||
B[1] = v128_add32( B[1], z[1].v128 );
|
||||
B[2] = v128_add32( B[2], z[2].v128 );
|
||||
B[3] = v128_add32( B[3], z[3].v128 );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||
/*
|
||||
v128_ovly ya[4], za[4], yb[4], zb[4];
|
||||
|
||||
ya[0].m128 = XA[0];
|
||||
yb[0].m128 = XB[0];
|
||||
ya[1].m128 = XA[1];
|
||||
yb[1].m128 = XB[1];
|
||||
ya[2].m128 = XA[2];
|
||||
yb[2].m128 = XB[2];
|
||||
ya[3].m128 = XA[3];
|
||||
yb[3].m128 = XB[3];
|
||||
ya[0].v128 = XA[0];
|
||||
yb[0].v128 = XB[0];
|
||||
ya[1].v128 = XA[1];
|
||||
yb[1].v128 = XB[1];
|
||||
ya[2].v128 = XA[2];
|
||||
yb[2].v128 = XB[2];
|
||||
ya[3].v128 = XA[3];
|
||||
yb[3].v128 = XB[3];
|
||||
|
||||
za[0].u32[0] = ya[0].u32[0];
|
||||
zb[0].u32[0] = yb[0].u32[0];
|
||||
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||
za[3].u32[3] = ya[0].u32[3];
|
||||
zb[3].u32[3] = yb[0].u32[3];
|
||||
|
||||
XA[0] = za[0].m128;
|
||||
XB[0] = zb[0].m128;
|
||||
XA[1] = za[1].m128;
|
||||
XB[1] = zb[1].m128;
|
||||
XA[2] = za[2].m128;
|
||||
XB[2] = zb[2].m128;
|
||||
XA[3] = za[3].m128;
|
||||
XB[3] = zb[3].m128;
|
||||
XA[0] = za[0].v128;
|
||||
XB[0] = zb[0].v128;
|
||||
XA[1] = za[1].v128;
|
||||
XB[1] = zb[1].v128;
|
||||
XA[2] = za[2].v128;
|
||||
XB[2] = zb[2].v128;
|
||||
XA[3] = za[3].v128;
|
||||
XB[3] = zb[3].v128;
|
||||
*/
|
||||
}
|
||||
|
||||
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||
/*
|
||||
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
|
||||
|
||||
ya[0].m128 = XA[0];
|
||||
yb[0].m128 = XB[0];
|
||||
yc[0].m128 = XC[0];
|
||||
ya[1].m128 = XA[1];
|
||||
yb[1].m128 = XB[1];
|
||||
yc[1].m128 = XC[1];
|
||||
ya[2].m128 = XA[2];
|
||||
yb[2].m128 = XB[2];
|
||||
yc[2].m128 = XC[2];
|
||||
ya[3].m128 = XA[3];
|
||||
yb[3].m128 = XB[3];
|
||||
yc[3].m128 = XC[3];
|
||||
ya[0].v128 = XA[0];
|
||||
yb[0].v128 = XB[0];
|
||||
yc[0].v128 = XC[0];
|
||||
ya[1].v128 = XA[1];
|
||||
yb[1].v128 = XB[1];
|
||||
yc[1].v128 = XC[1];
|
||||
ya[2].v128 = XA[2];
|
||||
yb[2].v128 = XB[2];
|
||||
yc[2].v128 = XC[2];
|
||||
ya[3].v128 = XA[3];
|
||||
yb[3].v128 = XB[3];
|
||||
yc[3].v128 = XC[3];
|
||||
|
||||
za[0].u32[0] = ya[0].u32[0];
|
||||
zb[0].u32[0] = yb[0].u32[0];
|
||||
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||
zb[3].u32[3] = yb[0].u32[3];
|
||||
zc[3].u32[3] = yc[0].u32[3];
|
||||
|
||||
XA[0] = za[0].m128;
|
||||
XB[0] = zb[0].m128;
|
||||
XC[0] = zc[0].m128;
|
||||
XA[1] = za[1].m128;
|
||||
XB[1] = zb[1].m128;
|
||||
XC[1] = zc[1].m128;
|
||||
XA[2] = za[2].m128;
|
||||
XB[2] = zb[2].m128;
|
||||
XC[2] = zc[2].m128;
|
||||
XA[3] = za[3].m128;
|
||||
XB[3] = zb[3].m128;
|
||||
XC[3] = zc[3].m128;
|
||||
XA[0] = za[0].v128;
|
||||
XB[0] = zb[0].v128;
|
||||
XC[0] = zc[0].v128;
|
||||
XA[1] = za[1].v128;
|
||||
XB[1] = zb[1].v128;
|
||||
XC[1] = zc[1].v128;
|
||||
XA[2] = za[2].v128;
|
||||
XB[2] = zb[2].v128;
|
||||
XC[2] = zc[2].v128;
|
||||
XA[3] = za[3].v128;
|
||||
XB[3] = zb[3].v128;
|
||||
XC[3] = zc[3].v128;
|
||||
*/
|
||||
}
|
||||
|
||||
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
||||
xf = (B[15] ^= C[15]);
|
||||
|
||||
|
||||
#define ROL32( a, c ) ror32( a, c )
|
||||
#define ROL32( a, c ) rol32( a, c )
|
||||
#define ADD32( a, b ) ( (a)+(b) )
|
||||
#define XOR( a, b ) ( (a)^(b) )
|
||||
|
||||
|
||||
@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
|
||||
bool register_scrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
#endif
|
||||
|
||||
@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
|
||||
}
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#elif defined(SHA256D_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d_sha;
|
||||
#elif defined(SHA256D_NEON_SHA2)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = NEON_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
|
||||
#elif defined(SHA256D_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
|
||||
@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
|
||||
#if defined(SHA256DT_16X32)
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
|
||||
#elif defined(SHA256DT_X86_SHA256)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
|
||||
#elif defined(SHA256DT_NEON_SHA256)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = NEON_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
|
||||
#elif defined(SHA256DT_8X32)
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_8x32;
|
||||
|
||||
@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_16way;
|
||||
#elif defined(SHA256T_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_sha;
|
||||
#elif defined(SHA256T_NEON_SHA2)
|
||||
gate->optimizations = NEON_OPT | SHA256_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
|
||||
#elif defined(SHA256T_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_sha256q_16way;
|
||||
gate->hash = (void*)&sha256q_16way_hash;
|
||||
//#elif defined(SHA256T_SHA)
|
||||
// gate->optimizations = SHA_OPT;
|
||||
// gate->optimizations = SHA256_OPT;
|
||||
// gate->scanhash = (void*)&scanhash_sha256q;
|
||||
// gate->hash = (void*)&sha256q_hash;
|
||||
#elif defined(SHA256T_8WAY)
|
||||
|
||||
@@ -71,12 +71,13 @@ static const uint64_t K512[80] =
|
||||
|
||||
// SHA-512 implemented using SHA512 CPU extension.
|
||||
|
||||
// Experimental. Not tested. Not reviewed. Compile tested only.
|
||||
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
|
||||
// Modelled after noloader sha256 implementation, replacing 4x32 bit
|
||||
// instructions with equivalent 4x64 bit instructions and increasing rounds
|
||||
// to 80.
|
||||
|
||||
// Needs GCC-14 for compilation.
|
||||
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
|
||||
// Modelled after noloader sha256 implementation.
|
||||
|
||||
|
||||
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in )
|
||||
@@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
|
||||
|
||||
uint64x2_t sha512_compile_test( uint64x2_t test )
|
||||
{
|
||||
test = vsha512hq_u64( test, test, test );
|
||||
test = vsha512h2q_u64( test, test, test );
|
||||
test = vsha512su0q_u64( test, test );
|
||||
test = vsha512su1q_u64( test, test, test );
|
||||
return test;
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#if defined(SIMD512)
|
||||
|
||||
|
||||
@@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_skein_8way;
|
||||
gate->hash = (void*)&skeinhash_8way;
|
||||
#elif defined(SKEIN_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#elif defined(SKEIN_2WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_2x64;
|
||||
gate->hash = (void*)&skeinhash_2x64;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein;
|
||||
gate->hash = (void*)&skeinhash;
|
||||
#endif
|
||||
|
||||
@@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst )
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
|
||||
mm128_rol_32( x, 17 ) ) )
|
||||
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
|
||||
mm128_rol_32( x, 23 ) ) )
|
||||
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 9 ), \
|
||||
v128_rol32( x, 17 ) ) )
|
||||
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \
|
||||
v128_rol32( x, 23 ) ) )
|
||||
|
||||
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
|
||||
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
|
||||
@@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
|
||||
int j;
|
||||
|
||||
for ( j = 0; j < 16; j++ )
|
||||
W[j] = mm128_bswap_32( block[j] );
|
||||
W[j] = v128_bswap32( block[j] );
|
||||
|
||||
for ( j = 16; j < 68; j++ )
|
||||
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
|
||||
W[ j-9 ] ),
|
||||
mm128_rol_32( W[ j-3 ], 15 ) ) ),
|
||||
_mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
|
||||
v128_rol32( W[ j-3 ], 15 ) ) ),
|
||||
_mm_xor_si128( v128_rol32( W[ j-13 ], 7 ),
|
||||
W[ j-6 ] ) );
|
||||
|
||||
for( j = 0; j < 64; j++ )
|
||||
@@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
|
||||
T = _mm_set1_epi32( 0x79CC4519UL );
|
||||
for( j =0; j < 16; j++ )
|
||||
{
|
||||
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
|
||||
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
|
||||
mm128_rol_var_32( T, j ) ), 7 );
|
||||
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
|
||||
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
|
||||
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
|
||||
SS2 ), W1[j] );
|
||||
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
|
||||
SS1 ), W[j] );
|
||||
D = C;
|
||||
C = mm128_rol_32( B, 9 );
|
||||
C = v128_rol32( B, 9 );
|
||||
B = A;
|
||||
A = TT1;
|
||||
H = G;
|
||||
G = mm128_rol_32( F, 19 );
|
||||
G = v128_rol32( F, 19 );
|
||||
F = E;
|
||||
E = P0( TT2 );
|
||||
}
|
||||
@@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
|
||||
T = _mm_set1_epi32( 0x7A879D8AUL );
|
||||
for( j =16; j < 64; j++ )
|
||||
{
|
||||
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
|
||||
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
|
||||
mm128_rol_var_32( T, j&31 ) ), 7 );
|
||||
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
|
||||
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
|
||||
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
|
||||
SS2 ), W1[j] );
|
||||
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
|
||||
SS1 ), W[j] );
|
||||
D = C;
|
||||
C = mm128_rol_32( B, 9 );
|
||||
C = v128_rol32( B, 9 );
|
||||
B = A;
|
||||
A = TT1;
|
||||
H = G;
|
||||
G = mm128_rol_32( F, 19 );
|
||||
G = v128_rol32( F, 19 );
|
||||
F = E;
|
||||
E = P0( TT2 );
|
||||
}
|
||||
@@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst )
|
||||
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
|
||||
}
|
||||
|
||||
count[0] = mm128_bswap_32(
|
||||
count[0] = v128_bswap32(
|
||||
_mm_set1_epi32( ctx->nblocks >> 23 ) );
|
||||
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
|
||||
count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
|
||||
( ctx->num << 3 ) ) );
|
||||
sm3_4way_compress( ctx->digest, block );
|
||||
|
||||
for ( i = 0; i < 8 ; i++ )
|
||||
hash[i] = mm128_bswap_32( ctx->digest[i] );
|
||||
hash[i] = v128_bswap32( ctx->digest[i] );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info)
|
||||
#define VH_N_INDEXES 4096
|
||||
#define VH_BYTE_ALIGNMENT 16
|
||||
|
||||
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
|
||||
{
|
||||
return (a ^ b) * 0x1000193;
|
||||
}
|
||||
#define fnv1a( a, b ) ( ( (a) ^ (b) ) * 0x1000193 )
|
||||
|
||||
#if 0
|
||||
static void rotate_indexes( uint32_t *p )
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
|
||||
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
|
||||
{
|
||||
__m256i *px = (__m256i*)p + x;
|
||||
|
||||
px[0] = mm256_rol_32( px[0], 1 );
|
||||
px[1] = mm256_rol_32( px[1], 1 );
|
||||
px[2] = mm256_rol_32( px[2], 1 );
|
||||
px[3] = mm256_rol_32( px[3], 1 );
|
||||
px[4] = mm256_rol_32( px[4], 1 );
|
||||
px[5] = mm256_rol_32( px[5], 1 );
|
||||
px[6] = mm256_rol_32( px[6], 1 );
|
||||
px[7] = mm256_rol_32( px[7], 1 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
|
||||
{
|
||||
__m128i *px = (__m128i*)p0_index + x;
|
||||
|
||||
px[0] = mm128_rol_32( px[0], 1 );
|
||||
px[1] = mm128_rol_32( px[1], 1 );
|
||||
px[2] = mm128_rol_32( px[2], 1 );
|
||||
px[3] = mm128_rol_32( px[3], 1 );
|
||||
px[4] = mm128_rol_32( px[4], 1 );
|
||||
px[5] = mm128_rol_32( px[5], 1 );
|
||||
px[6] = mm128_rol_32( px[6], 1 );
|
||||
px[7] = mm128_rol_32( px[7], 1 );
|
||||
}
|
||||
|
||||
#endif
|
||||
/*
|
||||
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
|
||||
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
// Vectorized and targetted version of fnv1a
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
MULXOR; \
|
||||
}
|
||||
|
||||
// subsequent passes rotate by r on demand, no need for mass rotate
|
||||
// subsequent passes rotate by r
|
||||
#define ROUND_r( r ) \
|
||||
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
{ \
|
||||
@@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||
const void *input, void *output )
|
||||
{
|
||||
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
|
||||
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32)));
|
||||
const uint32_t *blob = (const uint32_t*)blob_bytes;
|
||||
uint32_t accumulator = 0x811c9dc5;
|
||||
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
|
||||
|
||||
@@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
|
||||
int scanhash_verthash( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t edata[20] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
for (int i = 0; i < 20; i++)
|
||||
edata[i] = bswap_32( pdata[i] );
|
||||
// v128_bswap32_80( edata, pdata );
|
||||
v128_bswap32_80( edata, pdata );
|
||||
verthash_sha3_512_prehash_72( edata );
|
||||
|
||||
do
|
||||
|
||||
@@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&minotaur_hash;
|
||||
gate->miner_thread_init = (void*)&initialize_torture_garden;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
|
||||
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate )
|
||||
|
||||
#endif
|
||||
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
|
||||
| AVX512_OPT | VAES_OPT | NEON_OPT;
|
||||
return true;
|
||||
};
|
||||
@@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x25x;
|
||||
gate->hash = (void*)&x25x_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT |
|
||||
AVX512_OPT | VAES_OPT | NEON_OPT;
|
||||
InitializeSWIFFTX();
|
||||
return true;
|
||||
|
||||
@@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
|
||||
|
||||
bool register_yescryptr8g_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower_r8g;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
|
||||
@@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate )
|
||||
if ( yespower_params.pers )
|
||||
applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
|
||||
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
@@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
|
||||
yespower_params.r = 16;
|
||||
yespower_params.pers = NULL;
|
||||
yespower_params.perslen = 0;
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
@@ -195,7 +195,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_yescrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
@@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_yescryptr8_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
@@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_yescryptr16_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
@@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_yescryptr32_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
#if (__SSE2__) || defined(__aarch64__)
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# during develpment. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
|
||||
rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
|
||||
|
||||
# armv9 needs gcc-13
|
||||
|
||||
@@ -16,23 +16,25 @@ make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-armv9-aes-sha3
|
||||
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-armv9-aes-sha3-sve2
|
||||
#make clean || echo clean
|
||||
#CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
#make -j $(nproc)
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-armv9-aes-sha3-sve2
|
||||
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-armv8.2-aes-sha3-sve2
|
||||
# SVE2 available in armv8.5
|
||||
#make clean || echo clean
|
||||
#CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
#make -j $(nproc)
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-armv8.5-aes-sha3-sve2
|
||||
|
||||
# SHA3 available in armv8.4
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-armv8-aes-sha2-sve2
|
||||
mv cpuminer cpuminer-armv8.4-aes-sha3
|
||||
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# make clean and rm all the targetted executables.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
|
||||
|
||||
|
||||
28
configure
vendored
28
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.3.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='24.2'
|
||||
PACKAGE_STRING='cpuminer-opt 24.2'
|
||||
PACKAGE_VERSION='24.3'
|
||||
PACKAGE_STRING='cpuminer-opt 24.3'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 24.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 24.2
|
||||
cpuminer-opt configure 24.3
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 24.2, which was
|
||||
It was created by cpuminer-opt $as_me 24.3, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='24.2'
|
||||
VERSION='24.3'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -5810,11 +5810,11 @@ if test x$ac_prog_cxx_stdcxx = xno
|
||||
then :
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
|
||||
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
|
||||
if test ${ac_cv_prog_cxx_11+y}
|
||||
if test ${ac_cv_prog_cxx_cxx11+y}
|
||||
then :
|
||||
printf %s "(cached) " >&6
|
||||
else $as_nop
|
||||
ac_cv_prog_cxx_11=no
|
||||
ac_cv_prog_cxx_cxx11=no
|
||||
ac_save_CXX=$CXX
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
@@ -5856,11 +5856,11 @@ if test x$ac_prog_cxx_stdcxx = xno
|
||||
then :
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
|
||||
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
|
||||
if test ${ac_cv_prog_cxx_98+y}
|
||||
if test ${ac_cv_prog_cxx_cxx98+y}
|
||||
then :
|
||||
printf %s "(cached) " >&6
|
||||
else $as_nop
|
||||
ac_cv_prog_cxx_98=no
|
||||
ac_cv_prog_cxx_cxx98=no
|
||||
ac_save_CXX=$CXX
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 24.2, which was
|
||||
This file was extended by cpuminer-opt $as_me 24.3, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 24.2
|
||||
cpuminer-opt config.status 24.3
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [24.2])
|
||||
AC_INIT([cpuminer-opt], [24.3])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
||||
20
configure~
20
configure~
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='24.1'
|
||||
PACKAGE_STRING='cpuminer-opt 24.1'
|
||||
PACKAGE_VERSION='24.2'
|
||||
PACKAGE_STRING='cpuminer-opt 24.2'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.1:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 24.1
|
||||
cpuminer-opt configure 24.2
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 24.1, which was
|
||||
It was created by cpuminer-opt $as_me 24.2, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='24.1'
|
||||
VERSION='24.2'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 24.1, which was
|
||||
This file was extended by cpuminer-opt $as_me 24.2, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 24.1
|
||||
cpuminer-opt config.status 24.2
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
57
cpu-miner.c
57
cpu-miner.c
@@ -2847,12 +2847,14 @@ static bool cpu_capability( bool display_only )
|
||||
bool cpu_has_sse41 = has_sse41(); // X86_64 only
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx();
|
||||
// bool cpu_has_sve = has_sve(); // aarch64 only
|
||||
// bool cpu_has_sve2 = has_sve2();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_avx10 = has_avx10();
|
||||
bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
|
||||
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64
|
||||
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
|
||||
bool cpu_has_sha512 = has_sha512();
|
||||
bool sw_has_x86_64 = false;
|
||||
bool sw_has_aarch64 = false;
|
||||
@@ -2881,7 +2883,7 @@ static bool cpu_capability( bool display_only )
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_sha256 = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
|
||||
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
|
||||
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
|
||||
bool use_sse2;
|
||||
@@ -2896,7 +2898,6 @@ static bool cpu_capability( bool display_only )
|
||||
bool use_neon;
|
||||
bool use_none;
|
||||
|
||||
// x86_64
|
||||
#if defined(__x86_64__)
|
||||
sw_has_x86_64 = true;
|
||||
#elif defined(__aarch64__)
|
||||
@@ -2908,6 +2909,7 @@ static bool cpu_capability( bool display_only )
|
||||
sw_arm_arch = __ARM_ARCH;
|
||||
#endif
|
||||
#endif
|
||||
// x86_64_only
|
||||
#if defined(__SSE2__)
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
@@ -2935,7 +2937,7 @@ static bool cpu_capability( bool display_only )
|
||||
#if defined(__AVX10_1_512__)
|
||||
sw_has_avx10_512 = true;
|
||||
#endif
|
||||
|
||||
// x86_64 or AArch64
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
@@ -2945,9 +2947,10 @@ static bool cpu_capability( bool display_only )
|
||||
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
sw_has_sha256 = true;
|
||||
#endif
|
||||
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
|
||||
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
|
||||
sw_has_sha512 = true;
|
||||
#endif
|
||||
// AArch64 only
|
||||
#if defined(__ARM_NEON)
|
||||
sw_has_neon = true;
|
||||
#endif
|
||||
@@ -2995,7 +2998,12 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
}
|
||||
else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
|
||||
else if ( cpu_has_aarch64 )
|
||||
{
|
||||
printf( " AArch64 NEON" ); // NEON assumed
|
||||
// if ( cpu_has_sve2 ) printf( " SVE2 " );
|
||||
// else if ( cpu_has_sve ) printf( " SVE " );
|
||||
}
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha512 ) printf( " SHA512" );
|
||||
@@ -3022,8 +3030,8 @@ static bool cpu_capability( bool display_only )
|
||||
printf( " AArch64" );
|
||||
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
|
||||
if ( sw_has_neon ) printf( " NEON" );
|
||||
// if ( sw_has_sve ) printf( " SVE" );
|
||||
// else if ( sw_has_sve2 ) printf( " SVE2" );
|
||||
// if ( sw_has_sve2 ) printf( " SVE2" );
|
||||
// else if ( sw_has_sve ) printf( " SVE" );
|
||||
|
||||
}
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
@@ -3052,35 +3060,6 @@ static bool cpu_capability( bool display_only )
|
||||
|
||||
if ( display_only ) return true;
|
||||
|
||||
/*
|
||||
// Check for CPU and build incompatibilities
|
||||
if ( !cpu_has_sse2 && !cpu_has_aarch64 )
|
||||
{
|
||||
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES and AVX2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sse42 && !cpu_has_sse42 )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SSE4.2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_aes && !cpu_has_aes )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sha && !cpu_has_sha )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SHA!\n" );
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
|
||||
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
|
||||
@@ -3103,6 +3082,7 @@ static bool cpu_capability( bool display_only )
|
||||
// if ( cpu_has_aarch64 ) printf( " AArch64");
|
||||
// else
|
||||
// printf( " x86_64" );
|
||||
if ( use_neon ) printf( " NEON" );
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_avx ) printf( " AVX" );
|
||||
@@ -3112,7 +3092,6 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
if ( use_sha512 ) printf( " SHA512" );
|
||||
else if ( use_sha256 ) printf( " SHA256" );
|
||||
if ( use_neon ) printf( " NEON" );
|
||||
}
|
||||
printf( "\n" );
|
||||
|
||||
|
||||
@@ -38,7 +38,6 @@
|
||||
//
|
||||
// __m128i -> v128_t
|
||||
// _mm_ -> v128_
|
||||
// mm128_ -> v128_
|
||||
//
|
||||
// There is also new syntax to accomodate ARM's stricter type checking of
|
||||
// vector element size. They have no effect on x86_64.
|
||||
@@ -145,10 +144,8 @@
|
||||
typedef union
|
||||
{
|
||||
v128_t v128;
|
||||
__m128i m128;
|
||||
uint32_t u32[4];
|
||||
} __attribute__ ((aligned (16))) m128_ovly;
|
||||
#define v128_ovly m128_ovly
|
||||
} __attribute__ ((aligned (16))) v128_ovly;
|
||||
|
||||
// use for immediate constants, use load1 for mem.
|
||||
#define v128_64 _mm_set1_epi64x
|
||||
@@ -168,7 +165,7 @@ typedef union
|
||||
// compiler to exploit new features to produce optimum code.
|
||||
// Currently only used internally and by Luffa.
|
||||
|
||||
static inline __m128i mm128_mov64_128( const uint64_t n )
|
||||
static inline __m128i v128_mov64( const uint64_t n )
|
||||
{
|
||||
__m128i a;
|
||||
#if defined(__AVX__)
|
||||
@@ -178,10 +175,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
|
||||
|
||||
|
||||
static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
static inline __m128i v128_mov32( const uint32_t n )
|
||||
{
|
||||
__m128i a;
|
||||
#if defined(__AVX__)
|
||||
@@ -235,7 +230,7 @@ static inline int v128_cmpeq0( v128_t v )
|
||||
// Bitwise compare return 1 if all bits set.
|
||||
#define v128_cmpeq1(v) _mm_test_all ones(v)
|
||||
|
||||
#define v128_one mm128_mov64_128(1)
|
||||
#define v128_one v128_mov64(1)
|
||||
|
||||
// ASM avoids the need to initialize return variable to avoid compiler warning.
|
||||
// Macro abstracts function parentheses to look like an identifier.
|
||||
@@ -327,7 +322,7 @@ static inline __m128i v128_neg1_fn()
|
||||
/*
|
||||
// Copy i32 to element c of dest and copy remaining elemnts from v.
|
||||
#define v128_put32( v, i32, c ) \
|
||||
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
|
||||
v128_xim_32( v, v128_mov32( i32 ), (c)<<4 )
|
||||
*/
|
||||
|
||||
|
||||
@@ -463,13 +458,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm128_movmask_64( v ) \
|
||||
#define v128_movmask64( v ) \
|
||||
_mm_movemask_pd( (__m128d)(v) )
|
||||
#define v128_movmask64 mm128_movmask_64
|
||||
|
||||
#define mm128_movmask_32( v ) \
|
||||
#define v128_movmask32( v ) \
|
||||
_mm_movemask_ps( (__m128)(v) )
|
||||
#define v128_movmask32 mm128_movmask_32
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
@@ -608,9 +601,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
|
||||
#endif
|
||||
|
||||
// deprecated
|
||||
#define mm128_rol_32 v128_rol32
|
||||
|
||||
// ror( v1 ^ v0, n )
|
||||
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
|
||||
|
||||
@@ -689,7 +679,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
|
||||
/* Not used, exists only for compatibility with NEON if ever needed.
|
||||
#define v128_shufflev32( v, vmask ) \
|
||||
v128_shuffle32( v, mm128_movmask_32( vmask ) )
|
||||
v128_shuffle32( v, v128_movmask32( vmask ) )
|
||||
*/
|
||||
|
||||
#define v128_shuffle8 _mm_shuffle_epi8
|
||||
@@ -734,15 +724,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
#define v128_bswap32( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ) )
|
||||
// deprecated
|
||||
#define mm128_bswap_32 v128_bswap32
|
||||
|
||||
#define v128_bswap16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
|
||||
0x0607040502030001 )
|
||||
|
||||
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
|
||||
#define mm128_block_bswap_64( d, s ) \
|
||||
#define v128_block_bswap64( d, s ) \
|
||||
{ \
|
||||
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
@@ -754,8 +741,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap64_512 mm128_block_bswap_64
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
|
||||
#define v128_block_bswap64_512 v128_block_bswap64
|
||||
|
||||
#define v128_block_bswap64_1024( d, s ) \
|
||||
{ \
|
||||
@@ -779,7 +765,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
}
|
||||
|
||||
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
|
||||
#define mm128_block_bswap_32( d, s ) \
|
||||
#define v128_block_bswap32( d, s ) \
|
||||
{ \
|
||||
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
@@ -791,11 +777,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 v128_block_bswap32
|
||||
|
||||
|
||||
#define mm128_block_bswap32_128( d, s ) \
|
||||
#define v128_block_bswap32_128( d, s ) \
|
||||
{ \
|
||||
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
@@ -840,7 +825,6 @@ static inline v128_t v128_bswap32( __m128i v )
|
||||
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
|
||||
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
|
||||
}
|
||||
#define mm128_bswap_32 v128_bswap32
|
||||
|
||||
static inline v128_t v128_bswap16( __m128i v )
|
||||
{
|
||||
@@ -849,7 +833,7 @@ static inline v128_t v128_bswap16( __m128i v )
|
||||
|
||||
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
|
||||
|
||||
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
|
||||
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[0] = v128_bswap64( s[0] );
|
||||
d[1] = v128_bswap64( s[1] );
|
||||
@@ -860,9 +844,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
|
||||
d[6] = v128_bswap64( s[6] );
|
||||
d[7] = v128_bswap64( s[7] );
|
||||
}
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
|
||||
|
||||
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
|
||||
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = v128_bswap64( s[ 0] );
|
||||
d[ 1] = v128_bswap64( s[ 1] );
|
||||
@@ -882,7 +865,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
|
||||
d[15] = v128_bswap64( s[15] );
|
||||
}
|
||||
|
||||
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[0] = v128_bswap32( s[0] );
|
||||
d[1] = v128_bswap32( s[1] );
|
||||
@@ -893,10 +876,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
d[6] = v128_bswap32( s[6] );
|
||||
d[7] = v128_bswap32( s[7] );
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 v128_block_bswap32
|
||||
|
||||
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = v128_bswap32( s[ 0] );
|
||||
d[ 1] = v128_bswap32( s[ 1] );
|
||||
@@ -918,9 +900,6 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
#define v128_block_bswap32 mm128_block_bswap_32
|
||||
#define v128_block_bswap64 mm128_block_bswap_64
|
||||
|
||||
// alignr instruction for 32 & 64 bit elements is only available with AVX512
|
||||
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -73,10 +73,10 @@ typedef union
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
|
||||
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
|
||||
|
||||
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
|
||||
_mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
|
||||
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -108,8 +108,12 @@ static inline uint32_t le162( const uint16_t u16 )
|
||||
#define rol32 __rold
|
||||
#define ror32 __rord
|
||||
|
||||
/* these don't seem to work
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
// Documentation is vague, ror exists but is ambiguous. Docs say it can
|
||||
// do 32 or 64 registers. Assuming that is architecture specific andcan
|
||||
// only do 32 bit on 32 bit arch. Rarely used so not a big issue.
|
||||
static inline uint64_t ror64( uint64_t a, const int c )
|
||||
{
|
||||
uint64_t b;
|
||||
@@ -125,6 +129,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
|
||||
return b;
|
||||
}
|
||||
#define rol32( a, c ) ror32( a, 32-(c) )
|
||||
*/
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -38,7 +38,9 @@
|
||||
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
|
||||
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
|
||||
|
||||
// load & set1 combined, doesn't work
|
||||
// load & set1 combined. What if source is already loaded?
|
||||
// Don't use, leave it up to the compiler to optimize.
|
||||
// Same with vld1q_lane.
|
||||
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
|
||||
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
|
||||
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
|
||||
@@ -61,17 +63,13 @@
|
||||
#define v128_sub16 vsubq_u16
|
||||
#define v128_sub8 vsubq_u8
|
||||
|
||||
// returns low half, u64 undocumented, may not exist.
|
||||
#define v128_mul64 vmulq_u64
|
||||
// returns low half
|
||||
#define v128_mul32 vmulq_u32
|
||||
#define v128_mul16 vmulq_u16
|
||||
|
||||
// Widening multiply, align source elements with Intel
|
||||
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
|
||||
{
|
||||
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
|
||||
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
|
||||
}
|
||||
// Widening multiply, realign source elements from x86_64 to NEON.
|
||||
#define v128_mulw32( v1, v0 ) \
|
||||
vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
|
||||
|
||||
// compare
|
||||
#define v128_cmpeq64 vceqq_u64
|
||||
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
|
||||
memset( dst, 0, n*16 );
|
||||
}
|
||||
|
||||
|
||||
static inline void v128_memset( void *dst, const void *src, const int n )
|
||||
{
|
||||
for( int i = 0; i < n; i++ )
|
||||
@@ -373,7 +370,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
((uint8x16_t)(v)), c )
|
||||
|
||||
|
||||
// ror( v1 ^ v0, n )
|
||||
// ( v1 ^ v0 ) >>> n
|
||||
#if defined(__ARM_FEATURE_SHA3)
|
||||
|
||||
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
|
||||
@@ -438,7 +435,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
|
||||
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
|
||||
// Bit rotation already promotes faster widths. Usage is context sensitive.
|
||||
// preferred.
|
||||
|
||||
// reverse elements in vector lanes
|
||||
#define v128_qrev32 vrev64q_u32
|
||||
@@ -496,7 +492,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
|
||||
casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
|
||||
casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
|
||||
}
|
||||
#define v128_block_bswap32_256( dst, src ) \
|
||||
#define v128_block_bswap32_256 v128_block_bswap32
|
||||
|
||||
#define v128_block_bswap32_512( dst, src ) \
|
||||
{ \
|
||||
|
||||
239
sysinfos.c
239
sysinfos.c
@@ -16,7 +16,7 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__aarch64__) && !defined(__APPLE__)
|
||||
#if defined(__aarch64__)
|
||||
// for arm's "cpuid"
|
||||
#include <sys/auxv.h>
|
||||
#include <asm/hwcap.h>
|
||||
@@ -309,14 +309,49 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) && !defined(__APPLE__)
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
|
||||
unsigned int output[4] )
|
||||
{
|
||||
|
||||
output[0] = getauxval(AT_HWCAP);
|
||||
output[1] = getauxval(AT_HWCAP2);
|
||||
|
||||
/*
|
||||
#define has(CAP, hwcap) !!((hwcap) & HWCAP_##CAP)
|
||||
#define pr(CAP, hwcap) printf("%10s = %d\n", #CAP, has(CAP, hwcap))
|
||||
|
||||
unsigned long hwcaps = getauxval(AT_HWCAP);
|
||||
printf("HWCAP = 0x%lx\n", hwcaps);
|
||||
|
||||
pr(FP, hwcaps);
|
||||
pr(ASIMD, hwcaps);
|
||||
pr(EVTSTRM, hwcaps);
|
||||
pr(AES, hwcaps);
|
||||
pr(PMULL, hwcaps);
|
||||
pr(SHA1, hwcaps);
|
||||
pr(SHA2, hwcaps);
|
||||
pr(CRC32, hwcaps);
|
||||
pr(ATOMICS, hwcaps);
|
||||
pr(FPHP, hwcaps);
|
||||
pr(ASIMDHP, hwcaps);
|
||||
pr(CPUID, hwcaps);
|
||||
pr(ASIMDRDM, hwcaps);
|
||||
pr(JSCVT, hwcaps);
|
||||
pr(FCMA, hwcaps);
|
||||
pr(LRCPC, hwcaps);
|
||||
pr(DCPOP, hwcaps);
|
||||
pr(SHA3, hwcaps);
|
||||
pr(SM3, hwcaps);
|
||||
pr(SM4, hwcaps);
|
||||
pr(ASIMDDP, hwcaps);
|
||||
pr(SHA512, hwcaps);
|
||||
pr(SVE, hwcaps);
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
#define cpuid(leaf, subleaf, out) out[0] = 0;
|
||||
#endif
|
||||
@@ -482,7 +517,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
|
||||
// Other combinations are not defined.
|
||||
|
||||
// No technical need for this, the code won't run if false.
|
||||
static inline bool cpu_arch_x86_64()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
@@ -556,43 +590,18 @@ static inline bool has_sse42()
|
||||
#endif
|
||||
}
|
||||
|
||||
/* doesn't work
|
||||
static inline bool has_neon()
|
||||
{
|
||||
#if defined(__aarch64__) && !defined(__APPLE__)
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
return cpu_info[0];
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_aes_ni()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_sse2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( CPU_INFO, 0, cpu_info );
|
||||
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__) && !defined(__APPLE__)
|
||||
if ( has_neon() )
|
||||
{
|
||||
#if defined(KERNEL_HWCAP_AES)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
/* unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_AES;
|
||||
*/ }
|
||||
return false;
|
||||
return cpu_info[0] & HWCAP_NEON;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
*/
|
||||
|
||||
static inline bool has_avx()
|
||||
{
|
||||
@@ -616,52 +625,23 @@ static inline bool has_avx2()
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sha()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx() )
|
||||
static inline bool has_sve()
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ EBX_Reg ] & SHA_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__) && !defined(__APPLE__)
|
||||
if ( has_neon() )
|
||||
{
|
||||
#if defined(KERNEL_HWCAP_SHA2)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
/* unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_SHA2;
|
||||
*/ }
|
||||
return false;
|
||||
return cpu_info[0] & HWCAP_SVE;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sha512()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 1, cpu_info );
|
||||
return cpu_info[ EAX_Reg ] & SHA512_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__) && !defined(__APPLE__)
|
||||
if ( has_neon() )
|
||||
static inline bool has_sve2()
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_SHA3;
|
||||
}
|
||||
return false;
|
||||
return cpu_info[1] & HWCAP2_SVE2;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@@ -723,21 +703,6 @@ static inline bool has_avx512()
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_vaes()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ ECX_Reg ] & VAES_Flag;
|
||||
}
|
||||
return false;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_vbmi()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
@@ -760,6 +725,112 @@ static inline bool has_vbmi2()
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_aes()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_sse2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( CPU_INFO, 0, cpu_info );
|
||||
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_AES;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_vaes()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ ECX_Reg ] & VAES_Flag;
|
||||
}
|
||||
return false;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sveaes()
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[1] & HWCAP2_SVEAES;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sha256()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ EBX_Reg ] & SHA_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_SHA2;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sha512()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx2() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 1, cpu_info );
|
||||
return cpu_info[ EAX_Reg ] & SHA512_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_SHA512;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Arm only
|
||||
static inline bool has_sha3()
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[0] & HWCAP_SHA3;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_svesha3()
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
return cpu_info[1] & HWCAP2_SVESHA3;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Obsolete, AMD only
|
||||
static inline bool has_xop()
|
||||
{
|
||||
@@ -962,9 +1033,7 @@ static inline void cpu_brand_string( char* s )
|
||||
|
||||
#elif defined(__arm__) || defined(__aarch64__)
|
||||
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( 0, 0, cpu_info );
|
||||
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
|
||||
sprintf( s, "ARM 64 bit CPU" );
|
||||
|
||||
#else
|
||||
|
||||
|
||||
Reference in New Issue
Block a user