Compare commits


9 Commits
v24.2 ... v25.3

Author SHA1 Message Date
Jay D Dee  dd99580a4c  v25.3  2025-01-16 12:31:53 -05:00
Jay D Dee  1ed18bf22e  v25.2  2025-01-12 18:58:21 -05:00
Jay D Dee  1d9341ee92  v25.1  2024-12-30 21:33:04 -05:00
Jay D Dee  a45a333b40  v24.8  2024-12-25 23:12:29 -05:00
Jay D Dee  2b1037a7c7  v24.7  2024-12-16 19:17:19 -05:00
Jay D Dee  06624a0ff2  v24.6  2024-12-08 11:14:08 -05:00
Jay D Dee  8e91bfbe19  v24.5  2024-09-13 14:14:57 -04:00
Jay D Dee  47e24b50e8  v24.4  2024-07-01 00:33:19 -04:00
Jay D Dee  c47c4a8885  v24.3  2024-05-28 18:20:19 -04:00
93 changed files with 3219 additions and 5323 deletions

View File

@@ -1,19 +1,39 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
EXTRA_DIST = example-cfg.json nomacro.pl
else
SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif
bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl
dist_man_MANS = cpuminer.1
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
@@ -166,9 +186,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -275,29 +292,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -311,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
endif

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs, such as those in laptop computers, are not recommended because
they aren't designed for the extreme heat of operating at full load for
extended periods of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------
@@ -71,9 +55,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d4096
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256

View File

@@ -75,6 +75,70 @@ If not what makes it happen or not happen?
Change Log
----------
v25.3
#442, #443: Fixed a regression in Makefile.am.
Updated dockerfile.
Removed algo features log display.
Some code cleanup.
v25.2
ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.
v24.7
ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.
v24.4
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.

View File

@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -295,8 +295,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
@@ -417,8 +416,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },

View File

@@ -98,7 +98,6 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
@@ -166,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );

View File

@@ -6,9 +6,7 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// Credits
void argon2d_crds_hash( void *output, const void *input )
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,7 +32,7 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -50,7 +48,7 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -64,18 +62,16 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->scanhash = (void*)&scanhash_argon2d250;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Dynamic
void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,7 +97,7 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -118,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
@@ -133,17 +129,15 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->scanhash = (void*)&scanhash_argon2d500;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Unitus
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
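With the coin names gone, the entry points are named by their m_cost parameter, e.g. argon2d250_hash registered above. A minimal usage sketch, assuming the renamed function is linked into the build (error handling and endian conversion omitted):

#include <stdint.h>
#include <string.h>

void argon2d250_hash( void *output, const void *input );   // declared in the gate header below

int main(void)
{
   uint8_t  header[80];     // 80-byte block header, per INPUT_BYTES above
   uint32_t hash[8];        // 32-byte result, per OUTPUT_BYTES above

   memset( header, 0xab, sizeof header );   // dummy header data
   argon2d250_hash( hash, header );

   // In the miner, hash[7] is what scanhash_argon2d250 tests against the target.
   return hash[7] == 0 ? 0 : 1;
}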

View File

@@ -5,19 +5,19 @@
#include <stdint.h>
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
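The header keeps the old blake2b_8way_* names as aliases for the new lane-width naming. A minimal sketch of the 8x64 (AVX-512) path, assuming the header above is in scope, the build has AVX-512 enabled, and the length argument counts bytes per lane:

#include <stddef.h>
#include "blake2b-hash-4way.h"   // header shown above; file name assumed

// Hash eight messages at once. Input must already be interleaved 8 ways,
// 64 bits per lane; output is eight interleaved 64-byte digests (512 bytes).
void blake2b_8x64_hash_sketch( void *out, const void *in_interleaved,
                               size_t bytes_per_lane )
{
   blake2b_8x64_ctx ctx;
   blake2b_8x64_init( &ctx );
   blake2b_8x64_update( &ctx, in_interleaved, bytes_per_lane );
   blake2b_8x64_final( &ctx, out );
}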

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)

View File

@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
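The rewritten SBOX8 folds the trailing NOT into the ternary-logic immediates: 0x87 is not( xorand( a, td, tb ) ) and 0x69 is not( xor3( tb, b, d ) ), as the inline comments note. The immediate is simply the 8-entry truth table of the desired function indexed by (a<<2)|(b<<1)|c; a standalone sketch (not from the source) that derives both constants:

#include <stdio.h>

// Build a vpternlog immediate from a 3-input boolean function.
static unsigned ternlog_imm( unsigned (*f)( unsigned, unsigned, unsigned ) )
{
   unsigned imm = 0;
   for ( unsigned i = 0; i < 8; i++ )
   {
      unsigned a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
      imm |= ( f( a, b, c ) & 1 ) << i;
   }
   return imm;
}

static unsigned not_xorand( unsigned a, unsigned b, unsigned c )
{  return ~( a ^ ( b & c ) ) & 1;  }      // not( xorand( a, b, c ) )

static unsigned not_xor3( unsigned a, unsigned b, unsigned c )
{  return ~( a ^ b ^ c ) & 1;  }          // not( xor3( a, b, c ) )

int main(void)
{
   printf( "not(xorand): 0x%02X\n", ternlog_imm( not_xorand ) );   // 0x87
   printf( "not(xor3):   0x%02X\n", ternlog_imm( not_xor3 ) );     // 0x69
   return 0;
}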
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
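INPUT_BIG above is a fully unrolled 64-step message expansion: bit i of the 64-bit input word selects row i of T512 (viewed as 64 rows of eight 64-bit constants), and the selected rows are XORed into m0..m7. The looped form it replaces is kept below under #if 0. A scalar reference of the same computation, written only as an illustration:

#include <stdint.h>

void hamsi_input_big_ref( uint64_t m[8], uint64_t db, const uint64_t T512_64[64*8] )
{
   const uint64_t *tp = T512_64;            // T512 reinterpreted as 64-bit words
   for ( int j = 0; j < 8; j++ ) m[j] = 0;
   for ( int i = 0; i < 64; i++, tp += 8 )  // bit 0 first, matching the macro order
      if ( ( db >> i ) & 1 )
         for ( int j = 0; j < 8; j++ )
            m[j] ^= tp[j];
}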
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
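Both close routines now build the final padded block in a plain aligned array rather than the old variable-length union member. The padding itself is unchanged Keccak padding; in scalar byte terms (a sketch of the logic only, not the vectorized code) it amounts to:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Pad one lane's rate buffer: end-marker byte right after the message,
// zeros in between, final 0x80 bit in the last byte of the rate.
void keccak_pad_ref( uint8_t *block, size_t ptr, size_t lim, uint8_t eb )
{
   memset( block + ptr, 0, lim - ptr );
   block[ ptr ]     |= eb;      // hard-coded end byte (domain separator)
   block[ lim - 1 ] |= 0x80;    // final padding bit
}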

View File

@@ -47,25 +47,19 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(VL256)

View File

@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif

View File

@@ -46,7 +46,7 @@
#endif
#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

View File

@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )

View File

@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif // SHA
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
@@ -447,7 +444,7 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
output[i] = bswap_32( ostate[i] );
}
#ifdef HAVE_SHA256_8WAY
#if defined(__AVX2__)
/*
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
@@ -590,7 +587,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
output[i] = bswap_32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
#endif //AVX2
#if defined(SIMD512)
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#endif
#include "scrypt-core-4way.h"
/*
#if ( SCRYPT_THROUGHPUT == 1 )
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, scratchbuf, N ); // woring
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
scrypt_core_1way( X, scratchbuf, N );
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#endif
#if ( SCRYPT_THROUGHPUT == 8 )
@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
if ( work_restart[thrid].restart ) return 0;
@@ -1321,8 +1286,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#else
// SSE2
#elif defined(__SSE2__) || defined(__ARM_NEON)
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1481,7 +1445,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1491,31 +1455,31 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt parameters: N= %d, R= 1", opt_param_n );
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// scrypt_throughput = 2;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#elif defined(__AVX2__)
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
switch ( SCRYPT_THROUGHPUT )
{
case 16: // AVX512
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
case 2: // SHA256
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
break;
case 8: // AVX2
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
break;
case 4: // SSE2, NEON
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
default:
scratchbuf_size = opt_param_n; // 1 way
}
char t_units[4] = {0};
char d_units[4] = {0};
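The scratch buffer is now sized by a switch on SCRYPT_THROUGHPUT using the same bytes = N * (ways or bufs) * 128 formulas. A worked example with illustrative values:

#include <stdio.h>

int main(void)
{
   const int N_default = 1024;     // default N
   const int N_large   = 0x8000;   // > 0x4000, selects the multi-buffer path

   printf( "4-way, N=1024:   %d KiB\n", N_default * 4 * 128 / 1024 );        // 512 KiB
   printf( "2-buf, N=0x8000: %d MiB\n", N_large * 2 * 128 / (1024*1024) );   // 8 MiB
   return 0;
}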

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
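The 4-way HMAC init XORs the key into the 0x36 inner pad and the 0x5c outer pad four lanes at a time. The scalar HMAC-SHA256 pad setup it vectorizes (standard HMAC; hashing of the pads omitted here) looks like:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Build the inner and outer pads for a key of at most one block (64 bytes).
void hmac_sha256_pads( uint8_t ipad[64], uint8_t opad[64],
                       const uint8_t *key, size_t klen )
{
   memset( ipad, 0x36, 64 );
   memset( opad, 0x5c, 64 );
   for ( size_t i = 0; i < klen; i++ )
   {
      ipad[i] ^= key[i];    // fed to the inner SHA-256 before the message
      opad[i] ^= key[i];    // fed to the outer SHA-256 before the inner digest
   }
}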

View File

@@ -569,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -578,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -601,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -638,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -867,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
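// Reviewer note (sketch, not from the source): the _X/_Y pairs above are two
// independent SHA-256 lanes processed in lock-step; interleaving the two
// sha256rnds2 dependency chains can help hide instruction latency on cores
// with a single SHA execution unit.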
#endif // SHA

View File

@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;

View File

@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;

View File

@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_NEON_SHA2)
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->optimizations = SHA256_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)

View File

@@ -71,12 +71,13 @@ static const uint64_t K512[80] =
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
// Modelled after noloader sha256 implementation, replacing 4x32 bit
// instructions with equivalent 4x64 bit instructions and increasing rounds
// to 80.
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
// Modelled after noloader sha256 implementation.
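// Hedged build-gate sketch (added note; the feature macro is an assumption):
// GCC-14 with -msha512 is assumed to define __SHA512__, so the experimental
// code below could be compiled out entirely on older toolchains and the
// generic sha512 transform used instead.
#if defined(__SHA512__) && defined(__AVX2__)
  #define SHA512_X86_HW 1
#else
  #define SHA512_X86_HW 0
#endif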
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
/*
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
uint64x2_t sha512_compile_test( uint64x2_t test )
{
test = vsha512hq_u64( test, test, test );
test = vsha512h2q_u64( test, test, test );
test = vsha512su0q_u64( test, test );
test = vsha512su1q_u64( test, test, test );
return test;
}
#endif
*/
#if defined(SIMD512)

View File

@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
xa0 = mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
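// For reference, a scalar sketch of the same permutation element, modelled on
// the sph_shabal reference code (rol32 is a hypothetical rotate-left helper).
// Both macro variants above compute this; the new form merely splits the xa0
// update into sequential steps before folding in xm, xb1 and the and-not term:
//    xa0 = ((xa0 ^ xc ^ (rol32( xa1, 15 ) * 5)) * 3)
//          ^ xb1 ^ (xb2 & ~xb3) ^ xm;
//    xb0 = ~(rol32( xb0, 1 ) ^ xa0);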
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
xa0 = mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nist.h"
#include "simd_iv.h"
/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)
/*
* Increase the counter.
*/
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}
/*
* Initialize the hashState_sd with a given IV.
* If the IV is NULL, initialize with zeros.
*/
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif
// state->buffer = malloc(16*n + 16);
/*
* Align the buffer to a 128 bit boundary.
*/
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
// state->A = malloc((4*n+4)*sizeof(u32));
/*
* Align the buffer to a 128 bit boundary.
*/
// state->A += ((u32*)NULL - state->A)&3;
state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;
if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));
// free(state->buffer);
// free(state->A);
return 0;
}
/*
* Initialize the hashState_sd.
*/
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;
#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
* Nonstandard length: IV is not precomputed.
*/
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;
init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}
int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}
memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
#endif

View File

@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif
#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/
typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif
DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);
} hashState_sd;
/*
* NIST API
*/
int init_sd(hashState_sd *state, int hashbitlen);
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
int final_sd(hashState_sd *state, BitSequence *hashval);
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
/*
* Internal API
*/
//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);
#endif

View File

@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__
#include <limits.h>
/*
* This file defines some helper functions for cross-platform compatibility.
*/
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif
/*
* First define some integer types.
*/
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/*
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
* type, if any, or otherwise use a wider type.
*/
#include <stdint.h>
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))
#define HAS_64 1
#else
/*
* On non-C99 systems, we use "unsigned int" if it is wide enough,
* "unsigned long" otherwise. This supports all "reasonable" architectures.
* We have to be cautious: pre-C99 preprocessors handle constants
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
*/
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int u32;
#define C32(x) ((u32)(x ## U))
#else
typedef unsigned long u32;
#define C32(x) ((u32)(x ## UL))
#endif
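/*
 * Worked example of the test above (added note): ((UINT_MAX >> 11) >> 11)
 * >= 0x3FF holds exactly when UINT_MAX >= 0xFFC00000, i.e. when unsigned int
 * has at least 32 value bits.  With a 32-bit int, 0xFFFFFFFF >> 22 = 0x3FF
 * and the test passes; with a 16-bit int, 0xFFFF >> 11 >> 11 = 0 and it
 * fails.  Splitting the shift into two steps of 11 presumably keeps each
 * shift count small enough for preprocessors that evaluate in 16-bit widths.
 */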
/*
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
* test whether "unsigned long long" is available; we also know that
* gcc features this type, even if the libc headers do not know it.
*/
#if ((ULONG_MAX >> 31) >> 31) >= 3
typedef unsigned long u64;
#define HAS_64 1
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
typedef unsigned long long u64;
#define HAS_64 1
#else
/*
* No 64-bit type...
*/
#endif
#endif
/*
* fft_t should be at least 16 bits wide.
* using short int will require less memory, but int is faster...
*/
typedef int fft_t;
/*
* Implementation note: some processors have specific opcodes to perform
* a rotation. Recent versions of gcc recognize the rotation expression below
* and use the relevant opcodes, when appropriate.
*/
#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
/*
* The macro MAYBE_INLINE expands to an inline qualifier, if available.
*/
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif
/* */
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
}) \
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
#define rdtsc __rdtsc
#endif
/*
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
* n-byte boundary.
* It is defined as false on unknown architectures.
*/
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
* Unaligned 32-bit accesses are not expensive on x86, so we don't care
*/
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
#else
/*
* Unknown architecture: play safe
*/
#define IS_ALIGNED(p,n) 0
#endif
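/*
 * Usage sketch (mirrors update_sd in nist.c): hash straight from the caller's
 * buffer only when it meets the 16-byte requirement reported by
 * RequiredAlignment(); otherwise stage the data through state->buffer.
 *
 *    if ( IS_ALIGNED( data, align ) && current == 0 && databitlen >= bs )
 *        SIMD_Compress( state, data, 0 );
 */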
/* checks for endianness */
#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif
#ifdef __BYTE_ORDER
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif
#else
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif
#endif
#endif

View File

@@ -231,7 +231,7 @@ static void FFT64( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
#define BUTTERFLY_0( i,j ) \
do { \
@@ -240,25 +240,25 @@ do { \
X(i) = v128_sub16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t v = X(j); \
X(j) = v128_add16( X(i), X(j) ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
} while(0)
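// Mapping note (added for clarity): the old BUTTERFLY_N took an index n into
// w[] = {0, 2, 4, 6} and shifted by w[n]; the new form passes the shift count
// w_n directly, which is why the call sites change from (1,5,1), (2,6,2),
// (3,7,3) to (1,5,2), (2,6,4), (3,7,6), and from (1,3,2), (5,7,2) to
// (1,3,4), (5,7,4).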
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -329,10 +329,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t u = X(j); \
X(i) = v128_sl16( X(i), w[n] ); \
X(i) = v128_sl16( X(i), w_n ); \
X(j) = v128_sub16( X(j), X(i) ); \
X(i) = v128_add16( u, X(i) ); \
} while(0)
@@ -353,15 +353,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
@@ -864,25 +864,25 @@ do { \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -953,10 +953,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(i) = _mm256_slli_epi16( X(i), w_n ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
@@ -977,15 +977,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -1709,11 +1709,11 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
@@ -1792,10 +1792,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(i) = _mm512_slli_epi16( X(i), w_n ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)

View File

@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context
// databitlen is in bits

View File

@@ -1,948 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include "nist.h"
#include "vector.h"
//#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__)
#define PRINT_SOME 0
/*
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
else
return 1;
}
*/
int RequiredAlignment(void) {
return 16;
}
static const union cv V128 = CV(128);
static const union cv V255 = CV(255);
static const union cv V257 = CV(257);
static const union cv8 V0 = CV(0);
/*
* Reduce modulo 257; result is in [-127; 383]
* REDUCE(x) := (x&255) - (x>>8)
*/
#define REDUCE(x) \
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
/*
* Reduce from [-127; 383] to [-128; 128]
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
*/
#define EXTRA_REDUCE_S(x) \
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
/*
* Reduce modulo 257; result is in [-128; 128]
*/
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
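/*
 * Scalar sketch of the identity behind REDUCE (added note, not used by the
 * vector code): with an arithmetic shift, as v16_shift_r provides,
 * x = 256*(x>>8) + (x&255).  Since 256 == -1 (mod 257), it follows that
 * x == (x&255) - (x>>8) (mod 257), and for a signed 16-bit x the result
 * lies in [-127; 383].
 */
static inline int reduce257_scalar( int16_t x )
{
   return ( x & 255 ) - ( x >> 8 );
}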
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define MAYBE_VOLATILE
MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
/*
* Begin with 8 parallel DIF FFT_8 transforms
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
static const int w[] = {0, 2, 4, 6};
// v16 *Twiddle = (v16*)FFT64_Twiddle;
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 v = X(j); \
X(j) = v16_add(X(i), X(j)); \
if (n) \
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
else \
X(i) = v16_sub(X(i), v); \
} while(0)
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE(2);
DO_REDUCE(3);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(1);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
#undef BUTTERFLY
/*
* Multiply by twiddle factors
*/
X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
/*
* Transpose the FFT state with a revbin order permutation
* on the rows and the column.
* This will make the full FFT_64 in order.
*/
#define INTERLEAVE(i,j) \
do { \
v16 t1= X(i); \
v16 t2= X(j); \
X(i) = v16_interleavel(t1, t2); \
X(j) = v16_interleaveh(t1, t2); \
} while(0)
INTERLEAVE(1, 0);
INTERLEAVE(3, 2);
INTERLEAVE(5, 4);
INTERLEAVE(7, 6);
INTERLEAVE(2, 0);
INTERLEAVE(3, 1);
INTERLEAVE(6, 4);
INTERLEAVE(7, 5);
INTERLEAVE(4, 0);
INTERLEAVE(5, 1);
INTERLEAVE(6, 2);
INTERLEAVE(7, 3);
#undef INTERLEAVE
/*
* Finish with 8 parallel DIT FFT_8 transforms
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in time (DIT) radix-2 NTT.
* Input data is in revbin_permuted order.
*/
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 u = X(j); \
if (n) \
X(i) = v16_shift_l(X(i), w[n]); \
X(j) = v16_sub(X(j), X(i)); \
X(i) = v16_add(u, X(i)); \
} while(0)
DO_REDUCE(0);
DO_REDUCE(1);
DO_REDUCE(2);
DO_REDUCE(3);
DO_REDUCE(4);
DO_REDUCE(5);
DO_REDUCE(6);
DO_REDUCE(7);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(3);
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef BUTTERFLY
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
MAYBE_INLINE void fft128(void *a) {
int i;
// Temp space to help for interleaving in the end
v16 B[8];
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
/* Size-2 butterflies */
for (i = 0; i<8; i++) {
B[i] = v16_add(A[i], A[i+8]);
B[i] = REDUCE_FULL_S(B[i]);
A[i+8] = v16_sub(A[i], A[i+8]);
A[i+8] = REDUCE_FULL_S(A[i+8]);
A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
A[i+8] = REDUCE_FULL_S(A[i+8]);
}
fft64(B);
fft64(A+8);
/* Transpose (i.e. interleave) */
for (i=0; i<8; i++) {
A[2*i] = v16_interleavel (B[i], A[i+8]);
A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
}
}
#ifdef v16_broadcast
/* Compute the FFT using a table
* The function works if the value of the message is smaller
* than 2^14.
*/
void fft128_msg_final(short *a, const unsigned char *x) {
static const union cv FFT128_Final_Table[] = {
{{ 1, -211, 60, -67, 2, 92, -137, 123}},
{{ 2, 118, 45, 111, 97, -46, 49, -106}},
{{ 4, -73, -17, -11, 8, 111, -34, -22}},
{{ -68, -4, 76, -25, 96, -96, -68, -9}},
{{ 16, -35, -68, -44, 32, -70, -136, -88}},
{{ 0, -124, 17, 12, -6, 57, 47, -8}},
{{ 64, 117, -15, 81, 128, -23, -30, -95}},
{{ -68, -53, -52, -70, -10, -117, 77, 21}},
{{ -1, -46, -60, 67, -2, -92, -120, -123}},
{{ -2, -118, -45, -111, -97, 46, -49, 106}},
{{ -4, 73, 17, 11, -8, -111, 34, 22}},
{{ 68, 4, -76, 25, -96, 96, 68, 9}},
{{ -16, -222, 68, 44, -32, 70, -121, 88}},
{{ 0, 124, -17, -12, 6, -57, -47, 8}},
{{ -64, -117, 15, -81, -128, -234, 30, 95}},
{{ 68, 53, 52, 70, 10, 117, -77, -21}},
{{-118, -31, 116, -61, 21, -62, -25, -122}},
{{-101, 107, -45, -95, -8, 3, 101, -34}},
{{ 42, -124, -50, 13, 84, 9, -100, -231}},
{{ -79, -53, 82, 65, -81, 47, 61, 107}},
{{ -89, -239, 57, -205, -178, 36, -143, 104}},
{{-126, 113, 33, 111, 103, -109, 65, -114}},
{{ -99, 72, -29, -49, -198, -113, -58, -98}},
{{ 8, -27, -106, -30, 111, 6, 10, -108}},
{{-139, 31, -116, -196, -21, 62, 25, -135}},
{{ 101, -107, 45, 95, 8, -3, -101, 34}},
{{ -42, -133, 50, -13, -84, -9, 100, -26}},
{{ 79, 53, -82, -65, 81, -47, -61, -107}},
{{-168, -18, -57, -52, -79, -36, -114, -104}},
{{ 126, -113, -33, -111, -103, 109, -65, 114}},
{{ 99, -72, -228, 49, -59, 113, 58, -159}},
{{ -8, 27, 106, 30, -111, -6, -10, 108}}
};
// v16 *Table = (v16*)FFT128_Final_Table;
v16 *A = (v16*) a;
v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
// v16 msg2 = v16_broadcast(x[1]);
#if 0
int i;
for (i=0; i<16; i++) {
v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16 , msg2);
v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
sum = v16_add(sum, tmp);
A[i] = REDUCE_FULL_S(sum);
}
#else
#define FFT_FINAL(i) \
v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \
v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
sum##i = v16_add(sum##i, tmp##i); \
A[i] = REDUCE_FULL_S(sum##i);
FFT_FINAL(0)
FFT_FINAL(1)
FFT_FINAL(2)
FFT_FINAL(3)
FFT_FINAL(4)
FFT_FINAL(5)
FFT_FINAL(6)
FFT_FINAL(7)
FFT_FINAL(8)
FFT_FINAL(9)
FFT_FINAL(10)
FFT_FINAL(11)
FFT_FINAL(12)
FFT_FINAL(13)
FFT_FINAL(14)
FFT_FINAL(15)
#endif
}
#endif
void fft128_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
/*
* This allows tweaking the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
if (final)
UNPACK_TWEAK(3, FinalTweak.v16);
else
UNPACK_TWEAK(3, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft64(a);
fft64(a+64);
}
#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {
for (int i=0; i<64; i++)
a[i] = x[i];
for (int i=64; i<128; i++)
a[i] = 0;
a[127] = 1;
a[125] = final? 1: 0;
fft128(a);
}
#endif
void fft256_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT256_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
/*
* This allows tweaking the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
UNPACK(3);
UNPACK(4);
UNPACK(5);
UNPACK(6);
if (final)
UNPACK_TWEAK(7, FinalTweak.v16);
else
UNPACK_TWEAK(7, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft128(a);
fft128(a+128);
}
void rounds(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
const v32* M = (v32*)msg;
volatile v16* W = (v16*)fft;
register v32 S0, S1, S2, S3;
static const union cv code[] = { CV(185), CV(233) };
S0 = v32_xor(S[0], v32_bswap(M[0]));
S1 = v32_xor(S[1], v32_bswap(M[1]));
S2 = v32_xor(S[2], v32_bswap(M[2]));
S3 = v32_xor(S[3], v32_bswap(M[3]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define F(a,b,c,fun) F_##fun (a,b,c)
/*
* We split the round function in two halves
* so as to insert some independent computations in between
*/
#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
WW[j], r, s, SUM3_##z, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TT = F(a,b,c,fun); \
a = v32_rotate(a,r); \
w = v32_add(w, d); \
TT = v32_add(TT, w); \
TT = v32_rotate(TT,s); \
d = v32_shufxor(a,SUM3_##z); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
do { \
d = v32_add(d, TT); \
} while(0)
#define STEP(a,b,c,d,w,fun,r,s,z) \
do { \
register v32 TT; \
STEP_1(a,b,c,d,w,fun,r,s,z); \
STEP_2(a,b,c,d,w,fun,r,s); \
} while(0);
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z,r0) \
do { \
register v32 W0, W1, W2, W3, TT; \
W0 = v16_merge##u0(W[h0], W[l0]); \
W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
W1 = v16_merge##u1(W[h1], W[l1]); \
W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
W2 = v16_merge##u2(W[h2], W[l2]); \
W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
W3 = v16_merge##u3(W[h3], W[l3]); \
W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 1);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 2);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 0);
/*
* 4 rounds with code 233
*/
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 1);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 2);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 0);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 1);
/*
* 1 round as feed-forward
*/
STEP(S(0), S(1), S(2), S(3), S[0], 0, 4, 13, 20);
STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void rounds512(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
v32* M = (v32*) msg;
v16* W = (v16*) fft;
register v32 S0l, S1l, S2l, S3l;
register v32 S0h, S1h, S2h, S3h;
static const union cv code[] = { CV(185), CV(233) };
S0l = v32_xor(S[0], v32_bswap(M[0]));
S0h = v32_xor(S[1], v32_bswap(M[1]));
S1l = v32_xor(S[2], v32_bswap(M[2]));
S1h = v32_xor(S[3], v32_bswap(M[3]));
S2l = v32_xor(S[4], v32_bswap(M[4]));
S2h = v32_xor(S[5], v32_bswap(M[5]));
S3l = v32_xor(S[6], v32_bswap(M[6]));
S3h = v32_xor(S[7], v32_bswap(M[7]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
/*
* We split the round function in two halves
* so as to insert some independent computations in between
*/
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = v32_shufxor(a##l,1); \
d##h = v32_shufxor(a##h,1); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = v32_shufxor(a##h,2); \
d##h = v32_shufxor(a##l,2); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = v32_shufxor(a##l,2); \
d##h = v32_shufxor(a##h,2); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = v32_shufxor(a##l,3); \
d##h = v32_shufxor(a##h,3); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = v32_shufxor(a##h,1); \
d##h = v32_shufxor(a##l,1); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = v32_shufxor(a##h,3); \
d##h = v32_shufxor(a##l,3); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
WW[j], r, s, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TTl = Fl(a,b,c,fun); \
TTh = Fh(a,b,c,fun); \
a##l = v32_rotate(a##l,r); \
a##h = v32_rotate(a##h,r); \
w##l = v32_add(w##l, d##l); \
w##h = v32_add(w##h, d##h); \
TTl = v32_add(TTl, w##l); \
TTh = v32_add(TTh, w##h); \
TTl = v32_rotate(TTl,s); \
TTh = v32_rotate(TTh,s); \
PERM(z,d,a); \
} while(0)
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
STEP_1_(a,b,c,d,w,fun,r,s,z)
#define STEP_2_(a,b,c,d,w,fun,r,s) \
do { \
d##l = v32_add(d##l, TTl); \
d##h = v32_add(d##h, TTh); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
STEP_2_(a,b,c,d,w,fun,r,s)
#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
do { \
register v32 TTl, TTh, Wl=w1, Wh=w2; \
STEP_1(a,b,c,d,W,fun,r,s,z); \
STEP_2(a,b,c,d,W,fun,r,s); \
} while(0);
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG(w,hh,ll,u,z) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = v16_mergel(W[a], W[b]); \
w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
w##h = v16_mergeh(W[a], W[b]); \
w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
} while(0)
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z) \
do { \
register v32 W0l, W1l, W2l, W3l, TTl; \
register v32 W0h, W1h, W2h, W3h, TTh; \
MSG(W0,h0,l0,u0,z); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
MSG(W1,h1,l1,u1,z); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
MSG(W2,h2,l2,u2,z); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
MSG(W3,h3,l3,u3,z); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
* 1 round as feed-forward
*/
#define PERM_START 4
STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0);
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
if (state->hashbitlen <= 256) {
union cv Y[16];
short* y = (short*) Y[0].u16;
#ifdef v16_broadcast
if (final == 2) {
fft128_msg_final(y, m);
rounds(state->A, m, y);
} else {
fft128_msg(y, m, final);
rounds(state->A, m, y);
}
#else
fft128_msg(y, m, final);
rounds(state->A, m, y);
#endif
} else {
union cv Y[32];
short* y = (short*) Y[0].u16;
fft256_msg(y, m, final);
rounds512(state->A, m, y);
}
}
/*
* Give the FFT output in the regular order for consistency checks
*/
void fft128_natural(fft_t *x, unsigned char *a) {
union cv Y[16];
short* y = (short*) Y[0].u16;
int i;
fft128_msg(y, a, 0);
for(i=0; i<64; i++) {
x[2*i] = y[i];
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -1,248 +0,0 @@
#ifndef __VECTOR_H__
#define __VECTOR_H__
#include "compat.h"
#include "simd-utils.h"
/*******************************
* Using GCC vector extensions *
*******************************/
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));
typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));
typedef v16qi v8;
typedef v8hi v16;
typedef v4si v32;
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union u32 {
u32 u[4];
v32 v;
};
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#if 0
/* These instructions are shorter than the PAND/POR/... that GCC uses */
#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
#define v16_and(x,y) ((v16) vec_and ((x), (y)))
#define v16_or(x,y) ((v16) vec_or ((x), (y)))
#define v16_xor(x,y) ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))
#define v32_and(x,y) ((v32) vec_and ((x), (y)))
#define v32_or(x,y) ((v32) vec_or ((x), (y)))
#define v32_xor(x,y) ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))
#define v32_add(x,y) ((x)+(y))
#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x) (-(x))
#define v16_shift_l __builtin_ia32_psllwi128
#define v16_shift_r __builtin_ia32_psrawi128
#define v16_cmp __builtin_ia32_pcmpgtw128
#define v16_interleavel __builtin_ia32_punpcklwd128
#define v16_interleaveh __builtin_ia32_punpckhwd128
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l __builtin_ia32_pslldi128
#define v32_shift_r __builtin_ia32_psrldi128
#define v32_rotate(x,n) \
v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and v128_and
#define v16_or v128_or
#define v16_xor v128_xor
#define v32_and v128_and
#define v32_or v128_or
#define v32_xor v128_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v32_rotate(x,n) v128_rol32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {
{{1, 2, 4, 8, 16, 32, 64, 128}},
{{1, 60, 2, 120, 4, -17, 8, -34}},
{{1, 120, 8, -68, 64, -30, -2, 17}},
{{1, 46, 60, -67, 2, 92, 120, 123}},
{{1, 92, -17, -22, 32, 117, -30, 67}},
{{1, -67, 120, -73, 8, -22, -68, -70}},
{{1, 123, -34, -70, 128, 67, 17, 35}},
};
static const union cv FFT128_Twiddle[] = {
{{ 1, -118, 46, -31, 60, 116, -67, -61}},
{{ 2, 21, 92, -62, 120, -25, 123, -122}},
{{ 4, 42, -73, -124, -17, -50, -11, 13}},
{{ 8, 84, 111, 9, -34, -100, -22, 26}},
{{ 16, -89, -35, 18, -68, 57, -44, 52}},
{{ 32, 79, -70, 36, 121, 114, -88, 104}},
{{ 64, -99, 117, 72, -15, -29, 81, -49}},
{{128, 59, -23, -113, -30, -58, -95, -98}},
};
static const union cv FFT256_Twiddle[] = {
{{ 1, 41, -118, 45, 46, 87, -31, 14}},
{{ 60, -110, 116, -127, -67, 80, -61, 69}},
{{ 2, 82, 21, 90, 92, -83, -62, 28}},
{{ 120, 37, -25, 3, 123, -97, -122, -119}},
{{ 4, -93, 42, -77, -73, 91, -124, 56}},
{{ -17, 74, -50, 6, -11, 63, 13, 19}},
{{ 8, 71, 84, 103, 111, -75, 9, 112}},
{{ -34, -109, -100, 12, -22, 126, 26, 38}},
{{ 16, -115, -89, -51, -35, 107, 18, -33}},
{{ -68, 39, 57, 24, -44, -5, 52, 76}},
{{ 32, 27, 79, -102, -70, -43, 36, -66}},
{{ 121, 78, 114, 48, -88, -10, 104, -105}},
{{ 64, 54, -99, 53, 117, -86, 72, 125}},
{{ -15, -101, -29, 96, 81, -20, -49, 47}},
{{ 128, 108, 59, 106, -23, 85, -113, -7}},
{{ -30, 55, -58, -65, -95, -40, -98, 94}}
};
#endif

View File

@@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst )
#if defined(__SSE2__)
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
mm128_rol_32( x, 23 ) ) )
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 9 ), \
v128_rol32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \
v128_rol32( x, 23 ) ) )
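// Scalar reference of the SM3 permutations, for comparison with the vector
// macros above (rol32 is a hypothetical rotate-left helper):
//    P0(x) = x ^ rol32( x, 9 ) ^ rol32( x, 17 )
//    P1(x) = x ^ rol32( x, 15 ) ^ rol32( x, 23 )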
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm128_bswap_32( block[j] );
W[j] = v128_bswap32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm128_rol_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
v128_rol32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( v128_rol32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
@@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
count[0] = v128_bswap32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
hash[i] = v128_bswap32( ctx->digest[i] );
}
#endif

View File

@@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info)
#define VH_N_INDEXES 4096
#define VH_BYTE_ALIGNMENT 16
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
{
return (a ^ b) * 0x1000193;
}
#define fnv1a( a, b ) ( ( (a) ^ (b) ) * 0x1000193 )
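// Usage sketch (matches verthash_hash below): fold 32-bit words into an
// accumulator seeded with the standard FNV-1a offset basis.
//    uint32_t acc = 0x811c9dc5;          // offset basis
//    acc = fnv1a( acc, blob[i] );        // 0x1000193 is the 32-bit FNV prime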
#if 0
static void rotate_indexes( uint32_t *p )
{
#if defined(__AVX2__)
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{
__m256i *px = (__m256i*)p + x;
px[0] = mm256_rol_32( px[0], 1 );
px[1] = mm256_rol_32( px[1], 1 );
px[2] = mm256_rol_32( px[2], 1 );
px[3] = mm256_rol_32( px[3], 1 );
px[4] = mm256_rol_32( px[4], 1 );
px[5] = mm256_rol_32( px[5], 1 );
px[6] = mm256_rol_32( px[6], 1 );
px[7] = mm256_rol_32( px[7], 1 );
}
#else
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{
__m128i *px = (__m128i*)p0_index + x;
px[0] = mm128_rol_32( px[0], 1 );
px[1] = mm128_rol_32( px[1], 1 );
px[2] = mm128_rol_32( px[2], 1 );
px[3] = mm128_rol_32( px[3], 1 );
px[4] = mm128_rol_32( px[4], 1 );
px[5] = mm128_rol_32( px[5], 1 );
px[6] = mm128_rol_32( px[6], 1 );
px[7] = mm128_rol_32( px[7], 1 );
}
#endif
/*
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
*/
}
#endif
// Vectorized and targeted version of fnv1a
#if defined (__AVX2__)
@@ -191,7 +146,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
MULXOR; \
}
// subsequent passes rotate by r on demand, no need for mass rotate
// subsequent passes rotate by r
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
@@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
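
The fnv1a macro introduced above is the standard 32-bit FNV-1a mixing step; a minimal scalar sketch using the published FNV prime:

#include <stdint.h>

/* 32-bit FNV-1a mix step: xor the accumulator with the input word,
   then multiply by the FNV prime 0x01000193. */
static inline uint32_t fnv1a_mix( uint32_t a, uint32_t b )
{
   return ( a ^ b ) * 0x1000193u;
}

Each ROUND pass above chains this step over the subset words, with the accumulator seeded from the FNV offset basis 0x811c9dc5 visible in verthash_hash.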

View File

@@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
for (int i = 0; i < 20; i++)
edata[i] = bswap_32( pdata[i] );
// v128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
verthash_sha3_512_prehash_72( edata );
do
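
The v128_bswap32_80 call that replaces the loop above byte-swaps the 20 32-bit words of the 80-byte block header; a scalar sketch of the equivalent operation:

#include <stdint.h>

/* Byte-swap one 32-bit word (big-endian <-> little-endian). */
static inline uint32_t bswap32_scalar( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00u )
        | ( ( x << 8 ) & 0x00ff0000u ) | ( x << 24 );
}

/* Scalar equivalent of v128_bswap32_80(): swap all 20 header words. */
static void bswap32_80( uint32_t edata[20], const uint32_t pdata[20] )
{
   for ( int i = 0; i < 20; i++ )
      edata[i] = bswap32_scalar( pdata[i] );
}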

View File

@@ -204,11 +204,11 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -372,11 +372,11 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );

View File

@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} c11_ctx_holder;
c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
init_luffa( &c11_ctx.luffa, 512 );
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &c11_ctx.simd );
#else
init_sd( &c11_ctx.simd, 512 );
#endif
}
void c11_hash( void *output, const void *input )
@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
int scanhash_c11( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

View File

@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -37,11 +33,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
#ifdef __AES__
hashState_groestl groestl;
#else
@@ -62,11 +54,6 @@ void init_tt10_ctx()
init_luffa( &tt10_ctx.luffa, 512 );
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &tt10_ctx.simd );
#else
init_sd( &tt10_ctx.simd, 512 );
#endif
#ifdef __AES__
init_groestl( &tt10_ctx.groestl, 64 );
#else
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
}
break;
case 9:
if ( i == 0 )
{
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hashB,
(const BitSequence *)input + midlen, tail*8 );
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
final_sd( &ctx.simd, (BitSequence *)hashB );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
break;
default:
break;
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
break;
case 9:
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
#if defined(__aarch64__)
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
sph_simd512_close( &tt10_mid.simd, hash);
#else
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
#endif
break;
default:
break;
}

View File

@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;
x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}
void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__

View File

@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;
@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}
void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;
@@ -68,14 +60,9 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};
@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;
@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};
@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
//11---echo---
#ifdef __AES__

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );
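
The fix above removes a stray semicolon that made the `if` an empty statement, so the submission path ran unconditionally; a minimal illustration of the pattern with hypothetical stand-in functions:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the hash and submit calls, only to show
   the control-flow difference. */
static bool compute( int n ) { return ( n & 1 ) == 0; }
static void submit( int n )  { printf( "submit %d\n", n ); }

int main( void )
{
   int nonce = 1;

   /* Buggy form: the trailing ';' is the entire body of the if, so the
      indented call below it runs unconditionally. */
   if ( compute( nonce ) );
      submit( nonce );

   /* Fixed form, as in the patch: the call only runs when compute()
      returns true. */
   if ( compute( nonce ) )
      submit( nonce );

   return 0;
}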

View File

@@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate )
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT;
return true;
};

View File

@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"

View File

@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &sonoa_ctx.simd );
#else
init_sd( &sonoa_ctx.simd, 512 );
#endif
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);
if ( work_restart[thr_id].restart ) return 0;
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );

View File

@@ -418,11 +418,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
@@ -681,11 +681,11 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -895,11 +895,11 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
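
The v128_swap64_32 casts above implement the "convert LE32 to LE64" comment by swapping the 32-bit halves of every 64-bit word of the 80-byte header; a scalar sketch of the same permutation:

#include <stdint.h>

/* Scalar equivalent of the v128_swap64_32() calls at these call sites:
   within every 64-bit word of the header, swap the two 32-bit halves so
   the LE32 data matches the 64-bit lane order of the interleaved kernels. */
static void swap64_32_80( uint32_t edata[20], const uint32_t pdata[20] )
{
   for ( int i = 0; i < 20; i += 2 )
   {
      edata[i]     = pdata[i + 1];
      edata[i + 1] = pdata[i];
   }
}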

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"
@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif
//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

View File

@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

View File

@@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
| AVX512_OPT | VAES_OPT | NEON_OPT;
return true;
};
@@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT |
AVX512_OPT | VAES_OPT | NEON_OPT;
InitializeSWIFFTX();
return true;

View File

@@ -18,7 +18,6 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/fugue/fugue-aesni.h"
#include "algo/whirlpool/sph_whirlpool.h"

View File

@@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
bool register_yescryptr8g_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower_r8g;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

View File

@@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate )
if ( yespower_params.pers )
applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.r = 16;
yespower_params.pers = NULL;
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -195,7 +195,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

2
api.c
View File

@@ -531,7 +531,7 @@ static void api()
time_t bindstart;
struct sockaddr_in serv;
struct sockaddr_in cli;
socklen_t clisiz;
uint32_t clisiz;
bool addrok = false;
long long counter;
char *result;

View File

@@ -1,14 +0,0 @@
#!/bin/bash
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -4,72 +4,54 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
# armv9 needs gcc-13
# -march=armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3
mv cpuminer cpuminer-armv9-crypto-sha3
make clean || echo clean
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3-sve2
mv cpuminer cpuminer-armv9-crypto
make clean || echo clean
CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.2-aes-sha3-sve2
mv cpuminer cpuminer-armv9
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2-sve2
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes
mv cpuminer cpuminer-armv8.4-crypto-sha3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -18,28 +18,55 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-alderlake
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake
#mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Graniterapids will not include AVX10, SHA512 or APX,
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# Force AVX10-256
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256
# Force SHA512 AVX10-512
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver5" ./configure --with-curl
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-zen4
#mv cpuminer cpuminer-zen5
# Zen4: AVX512 SHA VAES
make clean || echo clean
@@ -70,7 +97,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512
# AVX2 SHA VAES: generic
# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
make clean || echo done
rm -f config.status
# vaes doesn't include aes

View File

@@ -1,27 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
#!/bin/sh
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,10 +0,0 @@
#!/bin/bash
#
# Compile on Windows using MSYS2 and MinGW.
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,20 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
#!/bin/sh
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
#strip -s cpuminer

View File

@@ -1,8 +1,8 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targeted executables.
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

View File

@@ -3,7 +3,7 @@
#ifdef WIN32
#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif
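
The relaxed comparison above enables CPU-group support for Windows 7 and every later target level rather than only an exact Windows 7 build; a minimal sketch of the effect (the 0x0602 value is just an example target):

/* With the old exact comparison, any build targeting a newer Windows
   version (e.g. _WIN32_WINNT 0x0602 for Windows 8) silently lost
   CPU-group support; the >= form keeps it enabled from Windows 7 up. */
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0602        /* example target: Windows 8 */
#endif

#if _WIN32_WINNT >= 0x0601         /* Windows 7 or later */
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif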

425
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.2'
PACKAGE_STRING='cpuminer-opt 24.2'
PACKAGE_VERSION='25.3'
PACKAGE_STRING='cpuminer-opt 25.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,14 +657,14 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
ARCH_ARM_TRUE
ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -796,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1360,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.3:";;
esac
cat <<\_ACEOF
@@ -1455,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1538,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.2
cpuminer-opt configure 25.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.2, which was
It was created by cpuminer-opt $as_me 25.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.2'
VERSION='25.3'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6502,32 +6500,30 @@ then :
fi
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
# Check whether --enable-assembly was given.
@@ -6542,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6705,51 +6582,7 @@ else $as_nop
fi
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthread $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthread_pthread_create=yes
else $as_nop
ac_cv_lib_pthread_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
fi
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
@@ -6787,12 +6620,132 @@ printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthread"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5
printf %s "checking for pthread_create in -lpthreadGC2... " >&6; }
if test ${ac_cv_lib_pthreadGC2_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC2 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC2_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC2_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC2_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5
printf %s "checking for pthread_create in -lpthreadGC1... " >&6; }
if test ${ac_cv_lib_pthreadGC1_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC1 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC1_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC1_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC1_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC1"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5
printf %s "checking for pthread_create in -lpthreadGC... " >&6; }
if test ${ac_cv_lib_pthreadGC_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC"
fi
fi
fi
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
@@ -6847,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6863,12 +6808,12 @@ else
ARCH_x86_64_FALSE=
fi
if test x$have_arm = xtrue; then
ARCH_ARM_TRUE=
ARCH_ARM_FALSE='#'
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
ARCH_ARM_TRUE='#'
ARCH_ARM_FALSE=
ARCH_ARM64_TRUE='#'
ARCH_ARM64_FALSE=
fi
if test "x$OS" = "xWindows_NT"; then
@@ -6879,13 +6824,15 @@ else
MINGW_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
if test x$have_apple = xtrue; then
HAVE_APPLE_TRUE=
HAVE_APPLE_FALSE='#'
else
JANSSON_LIBS=-ljansson
HAVE_APPLE_TRUE='#'
HAVE_APPLE_FALSE=
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
# Check whether --with-curl was given.
@@ -6902,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7102,22 +7029,22 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_ARM_TRUE}" && test -z "${ARCH_ARM_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM\" was never defined.
if test -z "${ARCH_ARM64_TRUE}" && test -z "${ARCH_ARM64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7508,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.2, which was
This file was extended by cpuminer-opt $as_me 25.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.2
cpuminer-opt config.status 25.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.2])
AC_INIT([cpuminer-opt], [25.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -41,32 +41,30 @@ AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc, le16dec, le16enc], [], [],
AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
AC_ARG_ENABLE([assembly],
@@ -75,54 +73,14 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",[])
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))
AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
@@ -136,16 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],
@@ -158,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])

2226
configure~

File diff suppressed because it is too large.

View File

@@ -206,7 +206,7 @@ static uint32_t last_block_height = 0;
static double highest_share = 0; // highest accepted share diff
static double lowest_share = 9e99; // lowest accepted share diff
static double last_targetdiff = 0.;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
static uint32_t hi_temp = 0;
static uint32_t prev_temp = 0;
#endif
@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];
GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
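A worked illustration of the mapping above, under the assumption of 64 logical CPUs per Windows processor group; the values are hypothetical and only show how a thread index resolves to a group and an in-group slot.
// Illustrative sketch only; group size and thread id are assumed values.
static inline void demo_cpu_group_mapping( void )
{
    unsigned long demo_group_size = 64;   // assumed GetActiveProcessorCount( 0 )
    unsigned long demo_thread     = 70;   // hypothetical miner thread id
    unsigned long demo_group      = demo_thread / demo_group_size;   // -> group 1
    unsigned long demo_slot       = demo_thread % demo_group_size;   // -> thread_affinity_map[6]
    (void)demo_group; (void)demo_slot;
}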
@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}
@@ -992,19 +991,19 @@ void report_summary_log( bool force )
if ( rejected_share_count > 10 )
{
if ( rejected_share_count > ( submitted_share_count * .5 ) )
if ( rejected_share_count > ( submitted_share_count / 2 ) )
{
applog(LOG_ERR,"Excessive rejected share rate, exiting...");
exit(1);
}
else if ( rejected_share_count > ( submitted_share_count * .1 ) )
else if ( rejected_share_count > ( submitted_share_count / 10 ) )
applog(LOG_WARNING,"High rejected share rate, check settings.");
}
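A minimal sketch of the thresholds above; the helper name is hypothetical and not part of the source. With integer division, more than half of the submitted shares rejected is fatal and more than a tenth triggers a warning, evaluated only once more than ten shares have been rejected.
// Hypothetical helper mirroring the integer thresholds above.
static inline int reject_severity( int rejected, int submitted )
{
    if ( rejected <= 10 )             return 0;   // too few rejects to judge
    if ( rejected > submitted / 2 )   return 2;   // excessive: miner exits
    if ( rejected > submitted / 10 )  return 1;   // high: warning only
    return 0;
}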
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &five_min_start );
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
// Display CPU temperature and clock rate.
int curr_temp = cpu_temp(0);
@@ -1013,8 +1012,9 @@ void report_summary_log( bool force )
if ( !opt_quiet || ( curr_temp >= 80 ) )
{
int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 :
curr_temp >= 70 ? 60 : 120;
int wait_time = curr_temp >= 90 ? 5
: curr_temp >= 80 ? 30
: curr_temp >= 70 ? 60 : 120;
timeval_subtract( &diff, &now, &cpu_temp_time );
if ( ( diff.tv_sec > wait_time )
|| ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) )
@@ -1591,13 +1591,13 @@ start:
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
work->height, work->tx_count, net_diff,
bswap_32( work->data[ algo_gate.ntime_index ] ) );
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
else if ( memcmp( work->data, g_work.data, algo_gate.work_cmp_size ) )
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
work->height, work->tx_count, net_diff,
bswap_32( work->data[ algo_gate.ntime_index ] ) );
else
new_work = false;
@@ -1912,6 +1912,8 @@ static bool wanna_mine(int thr_id)
{
bool state = true;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
if (opt_max_temp > 0.0)
{
float temp = cpu_temp(0);
@@ -1921,8 +1923,12 @@ static bool wanna_mine(int thr_id)
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
if ( temp > hi_temp ) hi_temp = temp;
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
#endif
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_NOTICE, "network diff too high, waiting...");
@@ -2133,7 +2139,7 @@ static void *miner_thread( void *userdata )
// uint32_t end_nonce = opt_benchmark
// ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - 0x20
// : 0;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - opt_n_threads;
memset( &work, 0, sizeof(work) );
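A short sketch of how the revised end_nonce expression partitions the 32-bit nonce space per thread; the helper is illustrative and the per-thread start value is an assumption not shown in this hunk.
// Illustrative only: each thread gets an equal slice of the nonce space,
// ending a small guard margin (opt_n_threads) below the next thread's slice.
static inline uint32_t demo_end_nonce( uint32_t thr_id, uint32_t n_threads )
{
    uint32_t slice = 0xffffffffU / n_threads;
    return slice * ( thr_id + 1 ) - n_threads;
}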
@@ -2200,58 +2206,58 @@ static void *miner_thread( void *userdata )
// int64_t max64 = 1000;
int nonce_found = 0;
// if ( likely( algo_gate.do_this_thread( thr_id ) ) )
// {
if ( have_stratum )
if ( have_stratum )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( unlikely( ( *nonceptr >= end_nonce )
&& !work_restart[thr_id].restart ) )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( unlikely( ( *nonceptr >= end_nonce )
&& !work_restart[thr_id].restart ) )
if ( opt_extranonce )
stratum_gen_work( &stratum, &g_work );
else
{
if ( opt_extranonce )
stratum_gen_work( &stratum, &g_work );
else
if ( !thr_id )
{
if ( !thr_id )
{
applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
applog( LOG_WARNING, "waiting for new work...");
}
while ( !work_restart[thr_id].restart )
sleep ( 1 );
applog( LOG_WARNING, "Nonce range exhausted, extranonce not subscribed." );
applog( LOG_WARNING, "Waiting for new work...");
}
while ( !work_restart[thr_id].restart )
sleep ( 1 );
}
}
else if ( !opt_benchmark ) // GBT or getwork
}
else if ( !opt_benchmark ) // GBT or getwork
{
// max64 is used to set end_nonce to match the scantime.
// It also factors the nonce range to end the scan when nonces are
// exhausted. In either case needing new work can be assumed.
// Only problem is every thread will call get_work.
// First thread resets scantime blocking all subsequent threads
// from fetching new work.
pthread_rwlock_wrlock( &g_work_lock );
const time_t now = time(NULL);
if ( ( ( now - g_work_time ) >= opt_scantime )
|| ( *nonceptr >= end_nonce ) )
{
pthread_rwlock_wrlock( &g_work_lock );
if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
|| ( *nonceptr >= end_nonce ) )
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = time(NULL);
// restart_threads();
}
pthread_rwlock_unlock( &g_work_lock );
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = now;
}
pthread_rwlock_rdlock( &g_work_lock );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
work_restart[thr_id].restart = 0;
pthread_rwlock_unlock( &g_work_lock );
}
// } // do_this_thread
// algo_gate.resync_threads( thr_id, &work );
pthread_rwlock_rdlock( &g_work_lock );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
work_restart[thr_id].restart = 0;
pthread_rwlock_unlock( &g_work_lock );
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -2309,12 +2315,6 @@ static void *miner_thread( void *userdata )
gettimeofday( (struct timeval *) &tv_start, NULL );
// Scan for nonce
// nonce_found = scanhash_sha256dt_ref( &work, max_nonce, &hashes_done,
// mythr );
// nonce_found = scanhash_sha256dt_4x32( &work, max_nonce, &hashes_done,
// mythr );
nonce_found = algo_gate.scanhash( &work, max_nonce, &hashes_done,
mythr );
@@ -2336,8 +2336,8 @@ static void *miner_thread( void *userdata )
// If unsubmiited nonce(s) found, submit now.
if ( unlikely( nonce_found && !opt_benchmark ) )
{
// applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
// algo_names[ opt_algo ] );
applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
algo_names[ opt_algo ] );
if ( !submit_work( mythr, &work ) )
{
applog( LOG_WARNING, "Failed to submit share." );
@@ -2400,7 +2400,7 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32) || defined(__APPLE__))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else
float lo_freq = 0., hi_freq = 0.;
@@ -2828,9 +2828,9 @@ out:
static void show_credits()
{
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" ********** \n");
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
printf(" with AVX512, SHA and VAES extensions by JayDDee.\n");
printf(" with AVX512, SHA, AES and NEON extensions by JayDDee.\n");
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
}
@@ -2840,40 +2840,46 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_aarch64 = cpu_arch_aarch64();
bool cpu_has_x86_64 = cpu_arch_x86_64();
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
bool cpu_has_vaes = has_vaes();
bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64
bool sw_has_neon = false; // AArch64
// bool sw_has_sve = false; // AArch64
// bool sw_has_sve2 = false; // AArch64
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10_256 = false;
bool sw_has_avx10_512 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2
bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3
set_t algo_features = algo_gate.optimizations;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
@@ -2881,7 +2887,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
@@ -2895,8 +2901,7 @@ static bool cpu_capability( bool display_only )
bool use_sha512;
bool use_neon;
bool use_none;
// x86_64
*/
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2908,6 +2913,8 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64 only
#if defined(__SSE2__)
sw_has_sse2 = true;
#endif
@@ -2935,9 +2942,10 @@ static bool cpu_capability( bool display_only )
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
@@ -2945,95 +2953,117 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
sw_has_sha256 = true;
#endif
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// #if defined(__ARM_FEATURE_SVE)
// sw_has_sve = true;
// #endif
// #if defined(__ARM_FEATURE_SVE2)
// sw_has_sve2 = true;
// #endif
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
#if defined(__ARM_FEATURE_SVE2)
sw_has_sve2 = true;
#endif
#if defined(__ARM_FEATURE_SME)
sw_has_sme = true;
#endif
#if defined(__ARM_FEATURE_SME2)
sw_has_sme2 = true;
#endif
// CPU
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf("SW built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-");
printf("%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf("\n");
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
// OS
#if defined(__linux)
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
printf(" Windows");
#if defined(__MINGW64__)
printf(" MinGW-w64\n");
#else
printf("\n");
#endif
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" BSD/Unix\n");
#else
printf("\n");
#endif
printf("CPU features: ");
if ( cpu_has_x86_64 )
if ( cpu_arch_x86_64() )
{
printf( " x86_64" );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
}
else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
printf( " x86_64" );
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
}
else if ( sw_has_aarch64 )
{
printf( " AArch64" );
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
if ( sw_has_neon ) printf( " NEON" );
// if ( sw_has_sve ) printf( " SVE" );
// else if ( sw_has_sve2 ) printf( " SVE2" );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
}
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
printf("\n");
/*
if ( !display_only )
{
printf("\nAlgo features: ");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
@@ -3041,7 +3071,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
@@ -3050,37 +3080,9 @@ static bool cpu_capability( bool display_only )
}
printf("\n");
if ( display_only ) return true;
/*
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 && !cpu_has_aarch64 )
{
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
return false;
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
printf( "The SW build requires a CPU with AES and AVX2!\n" );
return false;
}
if ( sw_has_sse42 && !cpu_has_sse42 )
{
printf( "The SW build requires a CPU with SSE4.2!\n" );
return false;
}
if ( sw_has_aes && !cpu_has_aes )
{
printf( "The SW build requires a CPU with AES!\n" );
return false;
}
if ( sw_has_sha && !cpu_has_sha )
{
printf( "The SW build requires a CPU with SHA!\n" );
return false;
}
*/
// Determine mining options
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
@@ -3096,13 +3098,10 @@ static bool cpu_capability( bool display_only )
|| use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );
// Display best options
applog_nl( "Enabled optimizations:" );
if ( use_none ) printf( " none" );
else
if ( !use_none )
{
// if ( cpu_has_aarch64 ) printf( " AArch64");
// else
// printf( " x86_64" );
applog_nl( "Enabled optimizations:" );
if ( use_neon ) printf( " NEON" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
@@ -3112,15 +3111,14 @@ static bool cpu_capability( bool display_only )
else if ( use_aes ) printf( " AES" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha256 ) printf( " SHA256" );
if ( use_neon ) printf( " NEON" );
printf( "\n" );
}
printf( "\n" );
*/
return true;
}
/*
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
@@ -3130,7 +3128,6 @@ void show_version_and_exit(void)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
printf(" features:"
#if defined(USE_ASM) && defined(__i386__)
" i386"
@@ -3178,12 +3175,11 @@ void show_version_and_exit(void)
printf("\n");
exit(0);
}
*/
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3199,7 +3195,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;
switch( key )
@@ -3343,7 +3338,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url
case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3408,7 +3404,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass
case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3568,10 +3565,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);
default:
@@ -3710,9 +3707,6 @@ int main(int argc, char *argv[])
{
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3883,12 +3877,23 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
#endif
conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );
@@ -3909,7 +3914,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

40
miner.h
View File

@@ -3,10 +3,7 @@
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
// CPU architecture
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
@@ -17,14 +14,15 @@
#define USER_AGENT_ARCH
#endif
// Operating system
// __APPLE__ includes MacOS & IOS, no MacOS only macros found.
#if defined(__linux)
#define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32)
#define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else
#define USER_AGENT_OS
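A hypothetical illustration of combining the tags above; the actual user agent composition is not shown in this hunk and may differ.
// Hypothetical only: e.g. an x86_64 Linux build would give "x64" "L" -> "x64L".
// The trailing "" keeps the declaration valid when both macros expand to nothing.
static const char demo_user_agent_tag[] = USER_AGENT_ARCH USER_AGENT_OS "";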
@@ -191,7 +189,7 @@ static inline uint32_t swab32(uint32_t x)
return __builtin_bswap32(x);
#else
return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );
// return bswap_32(v);
@@ -291,26 +289,6 @@ static inline void le16enc(void *pp, uint16_t x)
json_t* json_load_url(char* cfg_url, json_error_t *err);
//void sha256_init(uint32_t *state);
//void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
#define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
//#if defined(__x86_64__) && defined(USE_AVX2)
#if defined(__x86_64__) && defined(__AVX2__)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
#endif
struct work;
void work_free(struct work *w);
@@ -644,7 +622,6 @@ enum algos {
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_SKUNK,
@@ -740,7 +717,6 @@ static const char* const algo_names[] = {
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
"skunk",
@@ -855,10 +831,9 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
argon2d500\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\
blake2b Blake2b 256\n\
@@ -904,7 +879,6 @@ Options:\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\

View File

@@ -29,7 +29,6 @@
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors (Not used in cpuminer-opt)
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2).
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -141,28 +140,56 @@
#include <stdint.h>
#include <stddef.h>
// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI
// VL256: Include AVX512VL instructions on 256 & 128 bit vectors
// VBMI: Include AVX512VBMI instructions on all vectors.
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only without AVX10.
// AVX10 can exist without support for 512 bit vectors.
#if defined(__AVX10_1_512__)
#define SIMD512 1
#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SIMD512 1
/*
// Test for macros
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#endif
#ifdef __AVX10_1_256__
#warning "__AVX10_1_256__"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#warning "AVX512"
#endif
*/
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested separately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
// AVX512VL instructions applied to 256 & 128 bit vectors is supported with
// either AVX512VL or any version of AVX10.
#if defined(__AVX10_1__)
#define VL256 1
#elif defined(__AVX512VL__)
#define VL256 1
#endif
// VBMI does not exist on early versions of AVX512
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VL256 1
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define VL256 1
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VBMI 1
#endif
#endif
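A minimal usage sketch of the flags defined above, assuming these are the only gates involved; real algorithm files select their own code paths.
// Illustrative dispatch only.
#if defined(SIMD512)
   // 512, 256 & 128 bit vector code may all be compiled.
#elif defined(VL256)
   // AVX512VL / AVX10 enhanced 256 & 128 bit code only.
#else
   // Baseline 128 / 256 bit code without AVX512VL extensions.
#endif
#if defined(VBMI)
   // VBMI byte-permute variants may additionally be selected.
#endif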
/*
@@ -189,9 +216,6 @@
#include "simd-utils/simd-int.h"
// x86_64 MMX 64 bit vectors
#include "simd-utils/simd-64.h"
// x86_64 SSE2 128 bit vectors
#include "simd-utils/simd-128.h"
@@ -201,10 +225,6 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"
// move up after cleaning
// CPU architecture abstraction
//#include "simd-utils/simd-portable.h"
// aarch64 neon 128 bit vectors
#include "simd-utils/simd-neon.h"

View File

@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,
// 4x32
#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
#if defined(__x86_64__) && defined(__SSE2__)
#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
{ \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
}
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u32( src0, 4 );
s.val[1] = casti_v128u32( src1, 4 );
s.val[2] = casti_v128u32( src2, 4 );
s.val[3] = casti_v128u32( src3, 4 );
vst4q_u32( dst + 256, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u32( src0, 5 );
s.val[1] = casti_v128u32( src1, 5 );
s.val[2] = casti_v128u32( src2, 5 );
s.val[3] = casti_v128u32( src3, 5 );
vst4q_u32( dst + 320, s );
s.val[0] = casti_v128u32( src0, 6 );
s.val[1] = casti_v128u32( src1, 6 );
s.val[2] = casti_v128u32( src2, 6 );
s.val[3] = casti_v128u32( src3, 6 );
vst4q_u32( dst + 384, s );
s.val[0] = casti_v128u32( src0, 7 );
s.val[1] = casti_v128u32( src1, 7 );
s.val[2] = casti_v128u32( src2, 7 );
s.val[3] = casti_v128u32( src3, 7 );
vst4q_u32( dst + 448, s );
// if ( bit_len <= 1024 ) return;
}
static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
}
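For reference, a scalar sketch (not part of the source) of the memory layout the vst4q_u32 stores above produce: word i of source k lands at dst[4*i + k].
// Scalar equivalent of the 4x32 interleave, illustration only.
static inline void demo_intrlv_4x32_scalar( uint32_t *d, const uint32_t *s0,
        const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int words )
{
    for ( int i = 0; i < words; i++ )
    {
        d[ 4*i + 0 ] = s0[i];
        d[ 4*i + 1 ] = s1[i];
        d[ 4*i + 2 ] = s2[i];
        d[ 4*i + 3 ] = s3[i];
    }
}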
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
if ( bit_len <= 256 ) return;
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
if ( bit_len <= 512 ) return;
s = vld4q_u32( src + 256 );
casti_v128( dst0, 4 ) = s.val[0];
casti_v128( dst1, 4 ) = s.val[1];
casti_v128( dst2, 4 ) = s.val[2];
casti_v128( dst3, 4 ) = s.val[3];
if ( bit_len <= 640 ) return;
s = vld4q_u32( src + 320 );
casti_v128( dst0, 5 ) = s.val[0];
casti_v128( dst1, 5 ) = s.val[1];
casti_v128( dst2, 5 ) = s.val[2];
casti_v128( dst3, 5 ) = s.val[3];
s = vld4q_u32( src + 384 );
casti_v128( dst0, 6 ) = s.val[0];
casti_v128( dst1, 6 ) = s.val[1];
casti_v128( dst2, 6 ) = s.val[2];
casti_v128( dst3, 6 ) = s.val[3];
s = vld4q_u32( src + 448 );
casti_v128( dst0, 7 ) = s.val[0];
casti_v128( dst1, 7 ) = s.val[1];
casti_v128( dst2, 7 ) = s.val[2];
casti_v128( dst3, 7 ) = s.val[3];
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
}
#else // !SSE2 && !NEON
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u32_t s0 = casti_v128u32( src,0 );
v128u32_t s1 = casti_v128u32( src,1 );
v128u32_t s2 = casti_v128u32( src,2 );
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#endif
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128u32( d,16 ) = v128_duplane32( s4, 0 );
casti_v128u32( d,17 ) = v128_duplane32( s4, 1 );
casti_v128u32( d,18 ) = v128_duplane32( s4, 2 );
casti_v128u32( d,19 ) = v128_duplane32( s4, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
}
#endif
// 8x32
#if defined(__AVX2__)
#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data
// 2x64 SSE2, NEON
// 2x64
#if defined(__x86_64__) && defined(__SSE2__)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d1[7] = v128_unpackhi64( s[14], s[15] );
}
/*
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64x2x2_t s;
s.val[0] = casti_v128u64( src0, 0 );
s.val[1] = casti_v128u64( src1, 0 );
vst2q_u64( dst, s );
s.val[0] = casti_v128u64( src0, 1 );
s.val[1] = casti_v128u64( src1, 1 );
vst2q_u64( dst + 32, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u64( src0, 2 );
s.val[1] = casti_v128u64( src1, 2 );
vst2q_u64( dst + 64, s );
s.val[0] = casti_v128u64( src0, 3 );
s.val[1] = casti_v128u64( src1, 3 );
vst2q_u64( dst + 96, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u64( src0, 4 );
s.val[1] = casti_v128u64( src1, 4 );
vst2q_u64( dst + 128, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u64( src0, 5 );
s.val[1] = casti_v128u64( src1, 5 );
vst2q_u64( dst + 160, s );
s.val[0] = casti_v128u64( src0, 6 );
s.val[1] = casti_v128u64( src1, 6 );
vst2q_u64( dst + 192, s );
s.val[0] = casti_v128u64( src0, 7 );
s.val[1] = casti_v128u64( src1, 7 );
vst2q_u64( dst + 224, s );
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64x2x2_t s = vld2q_u64( src );
casti_v128u64( dst0, 0 ) = s.val[0];
casti_v128u64( dst1, 0 ) = s.val[1];
s = vld2q_u64( src + 32 );
casti_v128u64( dst0, 1 ) = s.val[0];
casti_v128u64( dst1, 1 ) = s.val[1];
if ( bit_len <= 256 ) return;
s = vld2q_u64( src + 64 );
casti_v128u64( dst0, 2 ) = s.val[0];
casti_v128u64( dst1, 2 ) = s.val[1];
s = vld2q_u64( src + 96 );
casti_v128u64( dst0, 3 ) = s.val[0];
casti_v128u64( dst1, 3 ) = s.val[1];
if ( bit_len <= 512 ) return;
s = vld2q_u64( src + 128 );
casti_v128u64( dst0, 4 ) = s.val[0];
casti_v128u64( dst1, 4 ) = s.val[1];
if ( bit_len <= 640 ) return;
s = vld2q_u64( src + 160 );
casti_v128u64( dst0, 5 ) = s.val[0];
casti_v128u64( dst1, 5 ) = s.val[1];
s = vld2q_u64( src + 192 );
casti_v128u64( dst0, 6 ) = s.val[0];
casti_v128u64( dst1, 6 ) = s.val[1];
s = vld2q_u64( src + 224 );
casti_v128u64( dst0, 7 ) = s.val[0];
casti_v128u64( dst1, 7 ) = s.val[1];
// if ( bit_len <= 1024 ) return;
}
#else
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}
*/
/*
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
*/
#endif
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u64_t s0 = casti_v128u64( src,0 );
v128u64_t s1 = casti_v128u64( src,1 );
v128u64_t s2 = casti_v128u64( src,2 );
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#endif
#if defined(__SSE2__)
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
#endif
casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
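// Note (assumption, not from the source): the 80 byte input to the function
// above is a serialized block header; the five 128-bit source vectors are
// byte swapped and each 64-bit half is broadcast to both lanes so that both
// interleaved lanes start from the same header data.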
static inline void extr_lane_2x64( void *dst, const void *src,
@@ -2436,7 +2649,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );

View File

@@ -32,13 +32,20 @@
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
// APX supports EGPR, which adds 16 more GPRs and 3-operand instructions.
// This may affect inline ASM that includes instructions superseded by APX
// versions and therefore incompatible with APX.
// As a result GCC-14 disables EGPR by default; it can be enabled with
// "-mapx-inline-asm-use-gpr32".
//TODO
// Some ASM functions may need to be updated to support EGPR with APX.
//
///////////////////////////////////////////////////////////////////////////////
// New architecturally agnostic syntax:
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
@@ -145,10 +152,8 @@
typedef union
{
v128_t v128;
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_ovly m128_ovly
} __attribute__ ((aligned (16))) v128_ovly;
// use for immediate constants, use load1 for mem.
#define v128_64 _mm_set1_epi64x
@@ -167,8 +172,13 @@ typedef union
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
// It also has implications for APX EGPR feature.
static inline __m128i mm128_mov64_128( const uint64_t n )
#define v128_mov64 _mm_cvtsi64_si128
#define v128_mov32 _mm_cvtsi32_si128
/*
static inline __m128i v128_mov64( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -178,10 +188,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
static inline __m128i mm128_mov32_128( const uint32_t n )
static inline __m128i v128_mov32( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -191,11 +199,14 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
return a;
}
*/
// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )
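// Worked example of the immediates (not from the source): _mm_shuffle_epi32
// copies source dword (imm >> 2*i) & 3 into dest dword i, so 0x44 = 01 00 01 00
// selects dwords {0,1,0,1}, duplicating the low 64-bit lane, while 0x00
// selects dword 0 four times.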
// Not used, test first
/*
#if defined(__AVX2__)
#define v128_bcast16(v) _mm_broadcastw_epi16(v)
@@ -203,9 +214,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
#define v128_bcast16(v) \
v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
_mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )
#endif
*/
// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
@@ -221,28 +233,15 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128(1)
//#define v128_one v128_mov64(1)
#define v128_one _mm_cvtsi64_si128( 1 )
// ASM avoids the need to initialize the return variable, suppressing a
// compiler warning.
// The macro hides the function parentheses so it looks like an identifier.
static inline __m128i v128_neg1_fn()
{
__m128i a;
#if defined(__AVX__)
#if defined(__AVX__)
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
@@ -273,7 +272,6 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -284,13 +282,14 @@ static inline __m128i v128_neg1_fn()
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
#define v128_get8( v, l ) _mm_extract_epi8( v, l )
#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l ) _mm_insert_epi64( v, u32, l )
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
@@ -327,7 +326,7 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim_32( v, v128_mov32( i32 ), (c)<<4 )
*/
@@ -401,7 +400,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
// Boolean operations
#if defined(VL256)
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
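// Illustrative caution (not from the source): because these macros reference
// an argument more than once, a side-effecting argument is evaluated twice,
// e.g. v128_ornot( a, *p++ ) advances p twice. Pass plain variables or use an
// inline function instead.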
@@ -435,13 +437,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#else
#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( _mm_xor_si128( a, b ), c )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( _mm_and_si128( a, b ), c )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( _mm_or_si128( a, b ), c )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
@@ -463,17 +465,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm128_movmask_64( v ) \
#define v128_movmask64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
#define v128_movmask32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
//
// Bit rotations
// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
_mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
@@ -483,6 +481,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
//
// Bit rotations
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -608,10 +609,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
// deprecated
#define mm128_rol_32 v128_rol32
// ror( v1 ^ v0, n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
@@ -689,7 +687,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
v128_shuffle32( v, v128_movmask32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -710,15 +708,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_shuffle16( v, 0x1b )
// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Endian byte swap.
#if defined(__SSSE3__)
@@ -734,15 +728,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -754,8 +745,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
@@ -779,7 +769,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -791,11 +781,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
#define mm128_block_bswap32_128( d, s ) \
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -840,7 +829,6 @@ static inline v128_t v128_bswap32( __m128i v )
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline v128_t v128_bswap16( __m128i v )
{
@@ -849,7 +837,7 @@ static inline v128_t v128_bswap16( __m128i v )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
@@ -860,9 +848,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
@@ -882,7 +869,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
@@ -893,10 +880,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
@@ -918,9 +904,6 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
@@ -932,25 +915,27 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_alignr8( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
_mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )
// The c arg is trivial: the only valid value is 1.
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )
#endif
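// Sanity check (assumption, not from the source): with c = 1,
// v128_alignr64( hi, lo, 1 ) = ( hi << 64 ) | ( lo >> 64 ) = { lo[1], hi[0] },
// matching the AVX512VL valignq result of shifting the hi:lo pair right by
// one 64-bit element.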
// blend using vector mask
#if defined(__SSE4_1__)
// Bytewise using sign bit of each byte element of mask
// Bytewise using sign bit of each byte element of mask. Use full bitmask
// for compatibility with SSE2 & NEON.
#define v128_blendv _mm_blendv_epi8
#else
// Bitwise
// Bitwise, use only bytewise for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
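// Portable contract (assumption, not from the source): with a full 0x00/0xff
// byte mask all three implementations agree, selecting per byte
// r[i] = mask[i] ? v0[i] : v1[i]; SSE4.1 tests only each byte's sign bit
// while SSE2 and NEON use every mask bit.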

View File

@@ -73,10 +73,10 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
@@ -172,19 +172,24 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
// Usage hints to improve performance when ternary logic is not available:
// if overwriting an input arg, put that arg first so the intermediate
// result can be stored in the dest.
// Put an arg with the nearest dependency last so independent args can be
// processed first.
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
_mm256_xor_si256( _mm256_xor_si256( a, b ), c )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
_mm256_and_si256( _mm256_and_si256( a, b ), c )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
_mm256_or_si256( _mm256_or_si256( a, b ), c )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
@@ -217,12 +222,11 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_movemask_ps( _mm256_castsi256_ps( v ) )
//
// Bit rotations.
// shuffle 16 bit elements within 64 bit lanes.
#define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
// reverse elements within lanes.
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
@@ -242,6 +246,9 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \

View File

@@ -125,7 +125,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// use asm to avoid compiler warning for unitialized local
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
{
__m512i v;
@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses 8 bit truth table to define any 3 input logical
// expression using any number or combinations of AND, OR, XOR, NOT.
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )

View File

@@ -1,182 +0,0 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__x86_64__) && defined(__MMX__)
////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
//
// This code is not used anywhere and likely never will be. Its intent was
// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
// functions, but it was never implemented.
//
#define v64_t __m64
#define v64u32_t v64_t
#define v64_load _mm_load_si64
#define v64_store _mm_store_si64
#define v64_64(i64) ((__m64)(i64))
#define v64_32 _mm_set1_pi32
#define v64_16 _mm_set1_pi16
#define v64_8 _mm_set1_pi8
#define v64_add32 _mm_add_pi32
#define v64_add16 _mm_add_pi16
#define v64_add8 _mm_add_pi8
#define v64_mul32 _mm_mullo_pi32
#define v64_mul16 _mm_mullo_pi16
// compare
#define v64_cmpeq32 _mm_cmpeq_epi32
#define v64_cmpeq16 _mm_cmpeq_epi16
#define v64_cmpeq8 _mm_cmpeq_epi8
#define v64_cmpgt32 _mm_cmpgt_epi32
#define v64_cmpgt16 _mm_cmpgt_epi16
#define v64_cmpgt8 _mm_cmpgt_epi8
#define v64_cmplt32 _mm_cmplt_epi32
#define v64_cmplt16 _mm_cmplt_epi16
#define v64_cmplt8 _mm_cmplt_epi8
// bit shift
#define v64_sl32 _mm_slli_epi32
#define v64_sl16 _mm_slli_epi16
#define v64_sl8 _mm_slli_epi8
#define v64_sr32 _mm_srli_epi32
#define v64_sr16 _mm_srli_epi16
#define v64_sr8 _mm_srli_epi8
#define v64_sra32 _mm_srai_epi32
#define v64_sra16 _mm_srai_epi16
#define v64_sra8 _mm_srai_epi8
#define v64_alignr8 _mm_alignr_pi8
#define v64_unpacklo32 _mm_unpacklo_pi32
#define v64_unpackhi32 _mm_unpackhi_pi32
#define v64_unpacklo16 _mm_unpacklo_pi16
#define v64_unpackhi16 _mm_unpackhi_pi16
#define v64_unpacklo8 _mm_unpacklo_pi8
#define v64_unpackhi8 _mm_unpackhi_pi8
// Pseudo constants
#define v64_zero _mm_setzero_si64()
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
#define v64_one_32 v64_32( 1UL )
#define v64_one_16 v64_16( 1U )
#define v64_one_8 v64_8( 1U );
#define v64_neg1 v64_32( 0xFFFFFFFFUL )
#define casti_v64(p,i) (((v64_t*)(p))[(i)])
// Bitwise not: ~(a)
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
static inline void v64_memset_zero( __m64 *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v64_or _mm_or_si64
#define v64_and _mm_and_si64
#define v64_xor _mm_xor_si64
#define v64_andnot _mm_andnot_si64
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
// Rotate bits in packed elements of 64 bit vector
#define v64_rol64( a, n ) \
_mm_or_si64( _mm_slli_si64( a, n ), \
_mm_srli_si64( a, 64-(n) ) )
#define v64_ror64( a, n ) \
_mm_or_si64( _mm_srli_si64( a, n ), \
_mm_slli_si64( a, 64-(n) ) )
#define v64_rol32( a, n ) \
_mm_or_si64( _mm_slli_pi32( a, n ), \
_mm_srli_pi32( a, 32-(n) ) )
#define v64_ror32( a, n ) \
_mm_or_si64( _mm_srli_pi32( a, n ), \
_mm_slli_pi32( a, 32-(n) ) )
#define v64_rol16( a, n ) \
_mm_or_si64( _mm_slli_pi16( a, n ), \
_mm_srli_pi16( a, 16-(n) ) )
#define v64_ror16( a, n ) \
_mm_or_si64( _mm_srli_pi16( a, n ), \
_mm_slli_pi16( a, 16-(n) ) )
// Rotate packed elements across lanes. Useful for byte swap and byte
// rotation.
#if defined(__SSE__)
// Swap hi & lo 32 bits.
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )
#define v64_shuflr16( a ) _mm_shuffle_pi16( a, 0x39 )
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#endif // SSE
#if defined(__SSSE3__)
// Endian byte swap packed elements
#define v64_bswap32( v ) \
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
#define v64_bswap16( v ) \
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else
#define v64_bswap32( v ) \
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
#define v64_bswap16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
#endif // SSSE3
#define v64_blendv( v1, v0, mask ) \
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
#endif // MMX
#endif // SIMD_64_H__

View File

@@ -2,7 +2,7 @@
#define SIMD_INT_H__ 1
//TODO compile time test for byte order
// be64 etc using HW bowap.
// be64 etc using HW bswap.
//
// Endian byte swap
#if defined(__x86_64__)
@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
return b;
}
// This produces warnings from clang, but its suggested workaround
// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
// it works as is on both.
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
@@ -94,7 +97,7 @@ static inline uint16_t be16( const uint16_t u16 )
return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) << 8 );
}
static inline uint32_t le162( const uint16_t u16 )
static inline uint32_t le16( const uint16_t u16 )
{
const uint8_t *p = (uint8_t const *)&u16;
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
@@ -108,8 +111,12 @@ static inline uint32_t le162( const uint16_t u16 )
#define rol32 __rold
#define ror32 __rord
/* these don't seem to work
#elif defined(__aarch64__)
// Documentation is vague: ror exists but is ambiguous. The docs say it can
// handle 32 or 64 bit registers; assuming that is architecture specific and
// only 32 bit is available on a 32 bit arch. Rarely used, so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
@@ -125,6 +132,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )
*/
#else

View File

@@ -38,7 +38,9 @@
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
// load & set1 combined, doesn't work
// load & set1 combined. What if source is already loaded?
// Don't use, leave it up to the compiler to optimize.
// Same with vld1q_lane.
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
@@ -61,17 +63,13 @@
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half, u64 undocumented, may not exist.
#define v128_mul64 vmulq_u64
// returns low half
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
}
// Widening multiply, realign source elements from x86_64 to NEON.
#define v128_mulw32( v1, v0 ) \
vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
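// Equivalence note (assumption, not from the source): x86_64 _mm_mul_epu32
// multiplies the low 32 bits of each 64-bit lane; vmovn_u64 extracts exactly
// those low halves, so vmull_u32 yields the same two 64-bit products.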
// compare
#define v128_cmpeq64 vceqq_u64
@@ -95,6 +93,8 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )
#define v128_cmpeq_zero vceqzq_u64
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
@@ -137,14 +137,14 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#endif
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )
// v2 ^ ( ~v1 & v0 )
#if defined(__ARM_FEATURE_SHA3)
@@ -180,6 +180,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
@@ -239,18 +240,15 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
#define v128_zero v128_64( 0ull )
#define v128_cmpeq_zero vceqzq_u64
#define v128_neg1 v128_64( 0xffffffffffffffffull )
// set1
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
#define v128_8 vmovq_n_u8
#define v128_zero v128_64( 0ull )
#define v128_neg1 v128_64( 0xffffffffffffffffull )
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
memset( dst, 0, n*16 );
}
static inline void v128_memset( void *dst, const void *src, const int n )
{
for( int i = 0; i < n; i++ )
@@ -360,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint16x8_t)(v)), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
((uint16x8_t)(v)), c )
#define v128_ror8( v, c ) \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
// ror( v1 ^ v0, n )
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
#else
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
@@ -414,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2rorx32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -438,7 +430,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
// preferred.
// reverse elements in vector lanes
#define v128_qrev32 vrev64q_u32
@@ -448,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_lrev16 vrev32q_u16
// aka bswap
#define v128_qrev8 vrev64q_u8
#define v128_lrev8 vrev32q_u8
#define v128_wrev8 vrev16q_u8
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
@@ -460,7 +451,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -469,12 +459,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }
static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }
// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
@@ -482,9 +466,9 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Usefull for x86_64 but does nothing for ARM
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -496,7 +480,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
#define v128_block_bswap32_256( dst, src ) \
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
@@ -551,8 +535,9 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )
#endif // __ARM_NEON
#endif // SIMD_NEON_H__

152
simd-utils/simd-sve.h Normal file
View File

@@ -0,0 +1,152 @@
// Placeholder for now.
//
// This file will hold AArch64 SVE code, a replacement for NEON that uses
// vector-length-agnostic instructions. This means the same code can be used
// on CPUs with different SVE vector register lengths. This is not good for
// vectorized hashing.
// Optimum hashing is sensitive to the vector register length, with different
// code used for different register sizes. On x86_64 the vector length is tied
// to the CPU feature, making it simple and efficient to handle different
// lengths, although it results in multiple executables. Theoretically SVE
// could use a single executable for any vector length.
//
// With the SVE vector length only known at run time there is run time
// overhead to test the vector length. Theoretically it could be tested at
// program load time and an appropriate library loaded. However, I don't know
// whether this can be done or, if so, how to do it.
//
// SVE is not expected to be used for 128 bit vectors as it does not provide
// any advantage over NEON. However, it may be implemented for testing purposes
// because CPUs with registers larger than 128 bits are currently very rare
// and are very expensive server class CPUs.
//
// However, 128 bit vectors also need to be supported on 256 bit registers.
// This could be a challenge for unpredicated functions.
//
// N-way parallel hashing could be the best use of SVE, using the same code
// for all vector lengths with the only variable being the number of lanes.
// This will still require run time checking but should be lighter than
// substituting functions.
// The current approach is to hard code the length in these intrinsics, which
// are called by existing length-specific code.
// Define with the sv_ prefix for generic use with the predicate provided by
// the caller, use sv<size>_ with a hard coded predicate, and v<size>_ only if
// and when it's compatible with SSE & NEON.
// Many instructions have no predicate operand, so how is the variable vector
// length handled? How does the CPU know how long the vector is and whether it
// spans multiple registers without the predicate?
// Also, how does the predicate define the vector size? How to tell whether
// inactive high lanes are part of the vector or beyond its range?
//
// Some instructions may have a predicate implied by other arguments.
// TBL, for example, only has shuffle indexes for active lanes.
// However this is dependent on software being aware of the register size.
#if 0
// #if defined USE_SV128
// NEON needs to be disabled
#define PRED128 0xffff
#define PRED256 0xffffffff
// Types should be transparent
#define sv128u32_t svuint32_t
#define sv256u32_t svuint32_t
// load1
// arithmetic
// _z zero inactive elements, _x undefined inactive elements, _m inactive
// elements from first arg. arg order only matters when _m used. Use _x.
#define sv_add32( p, v1, v0 ) svadd_u32_x( p, v1, v0 )
#define sv128_add32( v1, v0 ) svadd_u32_x( PRED128, v1, v0 )
#define sv256_add32( v1, v0 ) svadd_u32_x( PRED256, v1, v0 )
// Add integer to each element
#define sv_addi32( p, v, i ) svadd_n_u32_x( p, v, i )
// compare
#define sv_cmpeq32( p, v1, v0 ) svcmpeq_u32( p, v1, v0 )
#define sv128_cmpeq32( v1, v0 ) svcmpeq_u32( PRED128, v1, v0 )
#define sv256_cmpeq32( v1, v0 ) svcmpeq_u32( PRED256, v1, v0 )
// bit shift
#define sv_sl32( p, v, c ) svlsl_n_u32_x( p, v, c )
#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )
// logic
#define sv_or( p, v1, v0 ) svorr_u32_x( p, v1, v0 )
#define sv128_or( v1, v0 ) svorr_u32_x( PRED128, v1, v0 )
#define sv256_or( v1, v0 ) svorr_u32_x( PRED256, v1, v0 )
// ext used for alignr, and zip used for unpack have no predicate arg.
// How is vector length determined? How are register sizes handled?
// How are part registers handled?
// alignr (ext)
// unpack
// AES
// AES uses fixed 128 bit vectors, how does this work with larger registers?
// set1
#define sv128_32( n ) svdup_n_u32_x( PRED128, n )
#define sv256_32( n ) svdup_n_u32_x( PRED256, n )
// broadcast
// svdup_lane has no predicate
// constants
// pointer cast
// Bit rotation
// No predication for shift instructions
// Cross lane shuffles
// Very limited shuffling, mostly svtbl which has no predicate and uses
// vector for the index.
// endian byte swap
#define sv128_bswap32(v) svrevb_u32_x( PRED128, v )
// blend
#endif
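// Minimal vector-length-agnostic sketch (an assumption, not part of this
// placeholder): one answer to the predication questions above. svcntw()
// reports the number of 32-bit lanes per register at run time and
// svwhilelt_b32() builds the predicate that masks off lanes past the end of
// the buffer. Compile with e.g. -march=armv8-a+sve; buf and n are
// hypothetical names.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <stdint.h>

static inline void sve_add1_u32( uint32_t *buf, int64_t n )
{
   for ( int64_t i = 0; i < n; i += svcntw() )
   {
      svbool_t   p = svwhilelt_b32( i, n );     // active lanes only
      svuint32_t v = svld1_u32( p, buf + i );   // predicated load
      v = svadd_n_u32_x( p, v, 1 );             // add 1 to each active lane
      svst1_u32( p, buf + i, v );               // predicated store
   }
}
#endif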

View File

@@ -16,14 +16,19 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__aarch64__) && !defined(__APPLE__)
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
#define ARM_AUXV
#endif
#if defined(ARM_AUXV)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <sys/prctl.h>
#endif
#ifndef WIN32
#if !(defined(WIN32) || defined(__APPLE__))
// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
@@ -147,7 +152,7 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )
static inline float cpu_temp( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0.;
#else
return linux_cputemp( core );
@@ -156,7 +161,7 @@ static inline float cpu_temp( int core )
static inline uint32_t cpu_clock( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0;
#else
return linux_cpufreq( core );
@@ -169,17 +174,17 @@ static inline int cpu_fanpercent()
}
// CPUID
// x86_64 CPUID
// This list is incomplete, it only contains features of interest to cpuminer.
// Refer to http://en.wikipedia.org/wiki/CPUID for details.
// AVX10 compatibility notes
//
// Notation used: AVX10i.[version]_[vectorwidth]
// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
// Display format: AVX10.[version]-[vectorwidth]
// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
// with full 512 bit vector support.
// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// features applied only to 256 bit and 128 bit vectors.
// Future AVX10 versions will add new instructions and features.
@@ -275,8 +280,8 @@ static inline int cpu_fanpercent()
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
#if defined(__x86_64__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -309,16 +314,65 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(ARM_AUXV)
// Always test whether the HWCAP macro is defined in the kernel headers before
// attempting to compile code that uses it. If it isn't defined the feature
// can't be tested and won't be included in the build.
// This can occur when compiling with old kernel headers and a new CPU, and
// could result in a suboptimal build.
// The leaf and subleaf arguments are ignored.
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
output[0] = getauxval(AT_HWCAP);
#if defined(AT_HWCAP)
output[0] = getauxval( AT_HWCAP );
#else
output[0] = 0;
#endif
#if defined(AT_HWCAP2)
output[1] = getauxval( AT_HWCAP2 );
#else
output[1] = 0;
#endif
/*
#define has(CAP, hwcap) !!((hwcap) & HWCAP_##CAP)
#define pr(CAP, hwcap) printf("%10s = %d\n", #CAP, has(CAP, hwcap))
unsigned long hwcaps = getauxval(AT_HWCAP);
printf("HWCAP = 0x%lx\n", hwcaps);
pr(FP, hwcaps);
pr(ASIMD, hwcaps);
pr(EVTSTRM, hwcaps);
pr(AES, hwcaps);
pr(PMULL, hwcaps);
pr(SHA1, hwcaps);
pr(SHA2, hwcaps);
pr(CRC32, hwcaps);
pr(ATOMICS, hwcaps);
pr(FPHP, hwcaps);
pr(ASIMDHP, hwcaps);
pr(CPUID, hwcaps);
pr(ASIMDRDM, hwcaps);
pr(JSCVT, hwcaps);
pr(FCMA, hwcaps);
pr(LRCPC, hwcaps);
pr(DCPOP, hwcaps);
pr(SHA3, hwcaps);
pr(SM3, hwcaps);
pr(SM4, hwcaps);
pr(ASIMDDP, hwcaps);
pr(SHA512, hwcaps);
pr(SVE, hwcaps);
*/
}
#else
#define cpuid(leaf, subleaf, out) out[0] = 0;
#define cpuid( leaf, subleaf, output ) \
output[0] = output[1] = output[2] = output[3] = 0;
#endif
static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -447,31 +501,20 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#ifdef __ARM_FEATURE_SHA3
#warning "__ARM_FEATURE_SHA3"
#endif
*/
// GCC-14.1: the AVX512 macros are defined even when compiled with only
// -mavx10.1-256, causing compile errors in AVX512 code. Only with
// -mavx10.1-512 does it compile successfully.
// __EVEX512__ is set only when compiled with -mavx10.1-512.
// Adding -fno-evex512 doesn't help.
// Building with -mapxf fails to configure on a CPU without APX because it
// can't run the test program.
/*
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#ifdef __ARM_FEATURE_SHA512
#warning "__ARM_FEATURE_SHA512"
#endif
#ifdef __AVX10_1_256__
#warning "__AVX10_1_256__"
#ifdef __ARM_FEATURE_SVE
#warning "__ARM_FEATURE_SVE"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#ifdef __ARM_FEATURE_SVE2
#warning "__ARM_FEATURE_SVE2"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#ifdef __ARM_FEATURE_SME
#warning "__ARM_FEATURE_SME"
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
@@ -482,7 +525,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
// No technical need for this, the code won't run if false.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -515,11 +557,11 @@ static inline bool has_sse()
static inline bool has_sse2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
#else
return false;
return false;
#endif
}
@@ -556,39 +598,11 @@ static inline bool has_sse42()
#endif
}
// There's no HWCAP for NEON, assume it's always true.
static inline bool has_neon()
{
#if defined(__aarch64__) && !defined(__APPLE__)
unsigned int cpu_info[4] = { 0 };
return cpu_info[0];
#else
return false;
#endif
}
static inline bool has_aes_ni()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_AES)
return true;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
*/ }
return false;
#if defined(__aarch64__)
return true;
#else
return false;
#endif
@@ -616,54 +630,48 @@ static inline bool has_avx2()
#endif
}
static inline bool has_sha()
// SVE vector width is determined at run time.
static inline bool has_sve()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_SHA2)
return true;
#if defined(__aarch64__) && defined(HWCAP_SVE)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SVE;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
*/ }
return false;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
static inline bool has_sve2()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
}
return false;
#if defined(__aarch64__) && defined(HWCAP2_SVE2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVE2;
#else
return false;
return false;
#endif
}
static inline bool has_sme()
{
#if defined(__aarch64__) && defined(HWCAP2_SME)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME;
#else
return false;
#endif
}
static inline bool has_sme2()
{
#if defined(__aarch64__) && defined(HWCAP2_SME2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME2;
#else
return false;
#endif
}
@@ -723,6 +731,48 @@ static inline bool has_avx512()
#endif
}
static inline bool has_vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#else
return false;
#endif
}
static inline bool has_vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_AES)
// NEON AES
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
#else
return false;
#endif
}
static inline bool has_vaes()
{
#if defined(__x86_64__)
@@ -738,25 +788,78 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
static inline bool has_sveaes()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#if defined(__aarch64__) && defined(HWCAP2_SVEAES)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVEAES;
#else
return false;
#endif
}
static inline bool has_vbmi2()
static inline bool has_sha256()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA512)
// NEON SHA512
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA512;
#else
return false;
#endif
}
// Arm only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
// NEON SHA3
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
#else
return false;
#endif
}
static inline bool has_svesha3()
{
#if defined(__aarch64__) && defined(HWCAP2_SVESHA3)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVESHA3;
#else
return false;
#endif
}
@@ -815,10 +918,8 @@ static inline unsigned int avx10_version()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
}
return 0;
#else
return 0;
#endif
return 0;
}
// also includes 256 & 128
@@ -831,13 +932,11 @@ static inline bool has_avx10_512()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Includes 128 but may not include 512
// Includes 128 but might not include 512
static inline bool has_avx10_256()
{
#if defined(__x86_64__)
@@ -847,13 +946,11 @@ static inline bool has_avx10_256()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Maximum vector length
// AVX10 vector register length
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
@@ -864,24 +961,28 @@ static inline unsigned int avx10_vector_length()
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
return 0;
#else
return 0;
#endif
return 0;
}
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
#if defined(ARM_AUXV)
if ( has_sve() )
return ( prctl( PR_SVE_GET_VL ) & PR_SVE_VL_LEN_MASK ) * 8;
#endif
return 0;
}
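// Hypothetical usage (not from the source):
//   if ( has_sve() )
//      applog( LOG_INFO, "SVE vector length %d bits", sve_vector_length() );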
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = {0};
cpuid( VENDOR_ID, 0, cpu_info);
return cpu_info[ EAX_Reg ];
#else
return 0;
#endif
return 0;
}
// out of date
@@ -962,9 +1063,7 @@ static inline void cpu_brand_string( char* s )
#elif defined(__arm__) || defined(__aarch64__)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
sprintf( s, "ARM 64 bit CPU" );
#else

8
util.c
View File

@@ -1414,6 +1414,12 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
int n;
fd_set wd;
// Something nasty is going on with Windows on aarch64. This hack prevents
// corruption of the sctx pointer. It only works if placed inside the while loop.
#if defined(__aarch64__) && defined(WIN32) && defined(ARM_WIN_HACK)
printf("");
#endif
FD_ZERO( &wd );
FD_SET( sctx->sock, &wd );
if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )
@@ -2239,7 +2245,7 @@ static bool stratum_benchdata(json_t *result, json_t *params, int thr_id)
#endif
cpu_bestfeature(arch, 16);
if (has_aes_ni()) strcat(arch, " NI");
if (has_aes()) strcat(arch, " NI");
cpu_getmodelid(vendorid, 32);
cpu_getname(cpuname, 80);

View File

@@ -11,16 +11,11 @@
export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
@@ -40,7 +35,6 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
#cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building...
@@ -126,12 +120,12 @@ CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
#make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
#make -j 8
#strip -s cpuminer.exe