Compare commits

...

5 Commits
v24.5 ... v25.2

Author SHA1 Message Date
Jay D Dee
1ed18bf22e v25.2 2025-01-12 18:58:21 -05:00
Jay D Dee
1d9341ee92 v25.1 2024-12-30 21:33:04 -05:00
Jay D Dee
a45a333b40 v24.8 2024-12-25 23:12:29 -05:00
Jay D Dee
2b1037a7c7 v24.7 2024-12-16 19:17:19 -05:00
Jay D Dee
06624a0ff2 v24.6 2024-12-08 11:14:08 -05:00
57 changed files with 2406 additions and 4539 deletions

View File

@@ -1,19 +1,39 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
EXTRA_DIST = example-cfg.json nomacro.pl
else
SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif
bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl
dist_man_MANS = cpuminer.1
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
@@ -166,8 +186,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -274,29 +292,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -310,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
endif

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs like laptop computers are not recommended because they aren't
designed for extreme heat of operating at full load for extended periods of
time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblockte,plate using http:// or https://.
Supported Algorithms
--------------------

View File

@@ -75,6 +75,43 @@ If not what makes it happen or not happen?
Change Log
----------
v25.2
ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.
v24.7
ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.

View File

@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than than using movepi.
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

View File

@@ -46,7 +46,7 @@
#endif
#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

View File

@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nist.h"
#include "simd_iv.h"
/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)
/*
* Increase the counter.
*/
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}
/*
* Initialize the hashState_sd with a given IV.
* If the IV is NULL, initialize with zeros.
*/
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif
// state->buffer = malloc(16*n + 16);
/*
* Align the buffer to a 128 bit boundary.
*/
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
// state->A = malloc((4*n+4)*sizeof(u32));
/*
* Align the buffer to a 128 bit boundary.
*/
// state->A += ((u32*)NULL - state->A)&3;
state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;
if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));
// free(state->buffer);
// free(state->A);
return 0;
}
/*
* Initialize the hashState_sd.
*/
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;
#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
* Nonstandart length: IV is not precomputed.
*/
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;
init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}
int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}
memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
#endif

View File

@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif
#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/
typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif
DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);
} hashState_sd;
/*
* NIST API
*/
int init_sd(hashState_sd *state, int hashbitlen);
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
int final_sd(hashState_sd *state, BitSequence *hashval);
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
/*
* Internal API
*/
//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);
#endif

View File

@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__
#include <limits.h>
/*
* This file desfines some helper function for cross-platform compatibility.
*/
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif
/*
* First define some integer types.
*/
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/*
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
* type, if any, or otherwise use a wider type.
*/
#include <stdint.h>
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))
#define HAS_64 1
#else
/*
* On non-C99 systems, we use "unsigned int" if it is wide enough,
* "unsigned long" otherwise. This supports all "reasonable" architectures.
* We have to be cautious: pre-C99 preprocessors handle constants
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
*/
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int u32;
#define C32(x) ((u32)(x ## U))
#else
typedef unsigned long u32;
#define C32(x) ((u32)(x ## UL))
#endif
/*
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
* test whether "unsigned long long" is available; we also know that
* gcc features this type, even if the libc header do not know it.
*/
#if ((ULONG_MAX >> 31) >> 31) >= 3
typedef unsigned long u64;
#define HAS_64 1
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
typedef unsigned long long u64;
#define HAS_64 1
#else
/*
* No 64-bit type...
*/
#endif
#endif
/*
* fft_t should be at least 16 bits wide.
* using short int will require less memory, but int is faster...
*/
typedef int fft_t;
/*
* Implementation note: some processors have specific opcodes to perform
* a rotation. Recent versions of gcc recognize the expression above and
* use the relevant opcodes, when appropriate.
*/
#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
/*
* The macro MAYBE_INLINE expands to an inline qualifier, is available.
*/
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif
/* */
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
}) \
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
#define rdtsc __rdtsc
#endif
/*
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
* n-bit boundary.
* It is defined as false on unknown architectures.
*/
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
* Unaligned 32-bit access are not expensive on x86 so we don't care
*/
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
#else
/*
* Unkonwn architecture: play safe
*/
#define IS_ALIGNED(p,n) 0
#endif
/* checks for endianness */
#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif
#ifdef __BYTE_ORDER
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif
#else
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif
#endif
#endif

View File

@@ -231,7 +231,7 @@ static void FFT64( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
#define BUTTERFLY_0( i,j ) \
do { \
@@ -240,25 +240,25 @@ do { \
X(i) = v128_sub16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t v = X(j); \
X(j) = v128_add16( X(i), X(j) ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -329,10 +329,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t u = X(j); \
X(i) = v128_sl16( X(i), w[n] ); \
X(i) = v128_sl16( X(i), w_n ); \
X(j) = v128_sub16( X(j), X(i) ); \
X(i) = v128_add16( u, X(i) ); \
} while(0)
@@ -353,15 +353,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
@@ -864,25 +864,25 @@ do { \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -953,10 +953,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(i) = _mm256_slli_epi16( X(i), w_n ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
@@ -977,15 +977,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -1709,11 +1709,11 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
@@ -1792,10 +1792,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(i) = _mm512_slli_epi16( X(i), w_n ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)

View File

@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context
// databitlen is bits

View File

@@ -1,948 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include "nist.h"
#include "vector.h"
//#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__)
#define PRINT_SOME 0
/*
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
else
return 1;
}
*/
int RequiredAlignment(void) {
return 16;
}
static const union cv V128 = CV(128);
static const union cv V255 = CV(255);
static const union cv V257 = CV(257);
static const union cv8 V0 = CV(0);
/*
* Reduce modulo 257; result is in [-127; 383]
* REDUCE(x) := (x&255) - (x>>8)
*/
#define REDUCE(x) \
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
/*
* Reduce from [-127; 383] to [-128; 128]
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
*/
#define EXTRA_REDUCE_S(x) \
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
/*
* Reduce modulo 257; result is in [-128; 128]
*/
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define MAYBE_VOLATILE
MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
/*
* Begin with 8 parallels DIF FFT_8
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
static const int w[] = {0, 2, 4, 6};
// v16 *Twiddle = (v16*)FFT64_Twiddle;
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 v = X(j); \
X(j) = v16_add(X(i), X(j)); \
if (n) \
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
else \
X(i) = v16_sub(X(i), v); \
} while(0)
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE(2);
DO_REDUCE(3);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(1);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
#undef BUTTERFLY
/*
* Multiply by twiddle factors
*/
X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
/*
* Transpose the FFT state with a revbin order permutation
* on the rows and the column.
* This will make the full FFT_64 in order.
*/
#define INTERLEAVE(i,j) \
do { \
v16 t1= X(i); \
v16 t2= X(j); \
X(i) = v16_interleavel(t1, t2); \
X(j) = v16_interleaveh(t1, t2); \
} while(0)
INTERLEAVE(1, 0);
INTERLEAVE(3, 2);
INTERLEAVE(5, 4);
INTERLEAVE(7, 6);
INTERLEAVE(2, 0);
INTERLEAVE(3, 1);
INTERLEAVE(6, 4);
INTERLEAVE(7, 5);
INTERLEAVE(4, 0);
INTERLEAVE(5, 1);
INTERLEAVE(6, 2);
INTERLEAVE(7, 3);
#undef INTERLEAVE
/*
* Finish with 8 parallels DIT FFT_8
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in time (DIT) radix-2 NTT.
* Intput data is in revbin_permuted order.
*/
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 u = X(j); \
if (n) \
X(i) = v16_shift_l(X(i), w[n]); \
X(j) = v16_sub(X(j), X(i)); \
X(i) = v16_add(u, X(i)); \
} while(0)
DO_REDUCE(0);
DO_REDUCE(1);
DO_REDUCE(2);
DO_REDUCE(3);
DO_REDUCE(4);
DO_REDUCE(5);
DO_REDUCE(6);
DO_REDUCE(7);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(3);
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef BUTTERFLY
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
MAYBE_INLINE void fft128(void *a) {
int i;
// Temp space to help for interleaving in the end
v16 B[8];
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
/* Size-2 butterflies */
for (i = 0; i<8; i++) {
B[i] = v16_add(A[i], A[i+8]);
B[i] = REDUCE_FULL_S(B[i]);
A[i+8] = v16_sub(A[i], A[i+8]);
A[i+8] = REDUCE_FULL_S(A[i+8]);
A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
A[i+8] = REDUCE_FULL_S(A[i+8]);
}
fft64(B);
fft64(A+8);
/* Transpose (i.e. interleave) */
for (i=0; i<8; i++) {
A[2*i] = v16_interleavel (B[i], A[i+8]);
A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
}
}
#ifdef v16_broadcast
/* Compute the FFT using a table
* The function works if the value of the message is smaller
* than 2^14.
*/
void fft128_msg_final(short *a, const unsigned char *x) {
static const union cv FFT128_Final_Table[] = {
{{ 1, -211, 60, -67, 2, 92, -137, 123}},
{{ 2, 118, 45, 111, 97, -46, 49, -106}},
{{ 4, -73, -17, -11, 8, 111, -34, -22}},
{{ -68, -4, 76, -25, 96, -96, -68, -9}},
{{ 16, -35, -68, -44, 32, -70, -136, -88}},
{{ 0, -124, 17, 12, -6, 57, 47, -8}},
{{ 64, 117, -15, 81, 128, -23, -30, -95}},
{{ -68, -53, -52, -70, -10, -117, 77, 21}},
{{ -1, -46, -60, 67, -2, -92, -120, -123}},
{{ -2, -118, -45, -111, -97, 46, -49, 106}},
{{ -4, 73, 17, 11, -8, -111, 34, 22}},
{{ 68, 4, -76, 25, -96, 96, 68, 9}},
{{ -16, -222, 68, 44, -32, 70, -121, 88}},
{{ 0, 124, -17, -12, 6, -57, -47, 8}},
{{ -64, -117, 15, -81, -128, -234, 30, 95}},
{{ 68, 53, 52, 70, 10, 117, -77, -21}},
{{-118, -31, 116, -61, 21, -62, -25, -122}},
{{-101, 107, -45, -95, -8, 3, 101, -34}},
{{ 42, -124, -50, 13, 84, 9, -100, -231}},
{{ -79, -53, 82, 65, -81, 47, 61, 107}},
{{ -89, -239, 57, -205, -178, 36, -143, 104}},
{{-126, 113, 33, 111, 103, -109, 65, -114}},
{{ -99, 72, -29, -49, -198, -113, -58, -98}},
{{ 8, -27, -106, -30, 111, 6, 10, -108}},
{{-139, 31, -116, -196, -21, 62, 25, -135}},
{{ 101, -107, 45, 95, 8, -3, -101, 34}},
{{ -42, -133, 50, -13, -84, -9, 100, -26}},
{{ 79, 53, -82, -65, 81, -47, -61, -107}},
{{-168, -18, -57, -52, -79, -36, -114, -104}},
{{ 126, -113, -33, -111, -103, 109, -65, 114}},
{{ 99, -72, -228, 49, -59, 113, 58, -159}},
{{ -8, 27, 106, 30, -111, -6, -10, 108}}
};
// v16 *Table = (v16*)FFT128_Final_Table;
v16 *A = (v16*) a;
v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
// v16 msg2 = v16_broadcast(x[1]);
#if 0
int i;
for (i=0; i<16; i++) {
v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16 , msg2);
v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
sum = v16_add(sum, tmp);
A[i] = REDUCE_FULL_S(sum);
}
#else
#define FFT_FINAL(i) \
v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \
v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
sum##i = v16_add(sum##i, tmp##i); \
A[i] = REDUCE_FULL_S(sum##i);
FFT_FINAL(0)
FFT_FINAL(1)
FFT_FINAL(2)
FFT_FINAL(3)
FFT_FINAL(4)
FFT_FINAL(5)
FFT_FINAL(6)
FFT_FINAL(7)
FFT_FINAL(8)
FFT_FINAL(9)
FFT_FINAL(10)
FFT_FINAL(11)
FFT_FINAL(12)
FFT_FINAL(13)
FFT_FINAL(14)
FFT_FINAL(15)
#endif
}
#endif
void fft128_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
/*
* This allows to tweak the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
if (final)
UNPACK_TWEAK(3, FinalTweak.v16);
else
UNPACK_TWEAK(3, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft64(a);
fft64(a+64);
}
#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {
for (int i=0; i<64; i++)
a[i] = x[i];
for (int i=64; i<128; i++)
a[i] = 0;
a[127] = 1;
a[125] = final? 1: 0;
fft128(a);
}
#endif
void fft256_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT256_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
/*
* This allows to tweak the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
UNPACK(3);
UNPACK(4);
UNPACK(5);
UNPACK(6);
if (final)
UNPACK_TWEAK(7, FinalTweak.v16);
else
UNPACK_TWEAK(7, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft128(a);
fft128(a+128);
}
void rounds(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
const v32* M = (v32*)msg;
volatile v16* W = (v16*)fft;
register v32 S0, S1, S2, S3;
static const union cv code[] = { CV(185), CV(233) };
S0 = v32_xor(S[0], v32_bswap(M[0]));
S1 = v32_xor(S[1], v32_bswap(M[1]));
S2 = v32_xor(S[2], v32_bswap(M[2]));
S3 = v32_xor(S[3], v32_bswap(M[3]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define F(a,b,c,fun) F_##fun (a,b,c)
/*
* We split the round function in two halfes
* so as to insert some independent computations in between
*/
#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
WW[j], r, s, SUM3_##z, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TT = F(a,b,c,fun); \
a = v32_rotate(a,r); \
w = v32_add(w, d); \
TT = v32_add(TT, w); \
TT = v32_rotate(TT,s); \
d = v32_shufxor(a,SUM3_##z); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
do { \
d = v32_add(d, TT); \
} while(0)
#define STEP(a,b,c,d,w,fun,r,s,z) \
do { \
register v32 TT; \
STEP_1(a,b,c,d,w,fun,r,s,z); \
STEP_2(a,b,c,d,w,fun,r,s); \
} while(0);
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z,r0) \
do { \
register v32 W0, W1, W2, W3, TT; \
W0 = v16_merge##u0(W[h0], W[l0]); \
W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
W1 = v16_merge##u1(W[h1], W[l1]); \
W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
W2 = v16_merge##u2(W[h2], W[l2]); \
W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
W3 = v16_merge##u3(W[h3], W[l3]); \
W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 1);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 2);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 0);
/*
* 4 rounds with code 233
*/
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 1);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 2);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 0);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 1);
/*
* 1 round as feed-forward
*/
STEP(S(0), S(1), S(2), S(3), S[0], 0, 4, 13, 20);
STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void rounds512(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
v32* M = (v32*) msg;
v16* W = (v16*) fft;
register v32 S0l, S1l, S2l, S3l;
register v32 S0h, S1h, S2h, S3h;
static const union cv code[] = { CV(185), CV(233) };
S0l = v32_xor(S[0], v32_bswap(M[0]));
S0h = v32_xor(S[1], v32_bswap(M[1]));
S1l = v32_xor(S[2], v32_bswap(M[2]));
S1h = v32_xor(S[3], v32_bswap(M[3]));
S2l = v32_xor(S[4], v32_bswap(M[4]));
S2h = v32_xor(S[5], v32_bswap(M[5]));
S3l = v32_xor(S[6], v32_bswap(M[6]));
S3h = v32_xor(S[7], v32_bswap(M[7]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
/*
* We split the round function in two halfes
* so as to insert some independent computations in between
*/
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = v32_shufxor(a##l,1); \
d##h = v32_shufxor(a##h,1); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = v32_shufxor(a##h,2); \
d##h = v32_shufxor(a##l,2); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = v32_shufxor(a##l,2); \
d##h = v32_shufxor(a##h,2); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = v32_shufxor(a##l,3); \
d##h = v32_shufxor(a##h,3); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = v32_shufxor(a##h,1); \
d##h = v32_shufxor(a##l,1); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = v32_shufxor(a##h,3); \
d##h = v32_shufxor(a##l,3); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
WW[j], r, s, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TTl = Fl(a,b,c,fun); \
TTh = Fh(a,b,c,fun); \
a##l = v32_rotate(a##l,r); \
a##h = v32_rotate(a##h,r); \
w##l = v32_add(w##l, d##l); \
w##h = v32_add(w##h, d##h); \
TTl = v32_add(TTl, w##l); \
TTh = v32_add(TTh, w##h); \
TTl = v32_rotate(TTl,s); \
TTh = v32_rotate(TTh,s); \
PERM(z,d,a); \
} while(0)
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
STEP_1_(a,b,c,d,w,fun,r,s,z)
#define STEP_2_(a,b,c,d,w,fun,r,s) \
do { \
d##l = v32_add(d##l, TTl); \
d##h = v32_add(d##h, TTh); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
STEP_2_(a,b,c,d,w,fun,r,s)
#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
do { \
register v32 TTl, TTh, Wl=w1, Wh=w2; \
STEP_1(a,b,c,d,W,fun,r,s,z); \
STEP_2(a,b,c,d,W,fun,r,s); \
} while(0);
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG(w,hh,ll,u,z) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = v16_mergel(W[a], W[b]); \
w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
w##h = v16_mergeh(W[a], W[b]); \
w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
} while(0)
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z) \
do { \
register v32 W0l, W1l, W2l, W3l, TTl; \
register v32 W0h, W1h, W2h, W3h, TTh; \
MSG(W0,h0,l0,u0,z); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
MSG(W1,h1,l1,u1,z); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
MSG(W2,h2,l2,u2,z); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
MSG(W3,h3,l3,u3,z); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
* 1 round as feed-forward
*/
#define PERM_START 4
STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0);
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
if (state->hashbitlen <= 256) {
union cv Y[16];
short* y = (short*) Y[0].u16;
#ifdef v16_broadcast
if (final == 2) {
fft128_msg_final(y, m);
rounds(state->A, m, y);
} else {
fft128_msg(y, m, final);
rounds(state->A, m, y);
}
#else
fft128_msg(y, m, final);
rounds(state->A, m, y);
#endif
} else {
union cv Y[32];
short* y = (short*) Y[0].u16;
fft256_msg(y, m, final);
rounds512(state->A, m, y);
}
}
/*
* Give the FFT output in the regular order for consitancy checks
*/
void fft128_natural(fft_t *x, unsigned char *a) {
union cv Y[16];
short* y = (short*) Y[0].u16;
int i;
fft128_msg(y, a, 0);
for(i=0; i<64; i++) {
x[2*i] = y[i];
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -1,246 +0,0 @@
#ifndef __VECTOR_H__
#define __VECTOR_H__
#include "compat.h"
#include "simd-utils.h"
/*******************************
* Using GCC vector extensions *
*******************************/
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));
typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));
typedef v16qi v8;
typedef v8hi v16;
typedef v4si v32;
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union u32 {
u32 u[4];
v32 v;
};
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#if 0
/* These instruction are shorter than the PAND/POR/... that GCC uses */
#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
#define v16_and(x,y) ((v16) vec_and ((x), (y)))
#define v16_or(x,y) ((v16) vec_or ((x), (y)))
#define v16_xor(x,y) ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))
#define v32_and(x,y) ((v32) vec_and ((x), (y)))
#define v32_or(x,y) ((v32) vec_or ((x), (y)))
#define v32_xor(x,y) ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))
#define v32_add(x,y) ((x)+(y))
#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x) (-(x))
#define v16_shift_l __builtin_ia32_psllwi128
#define v16_shift_r __builtin_ia32_psrawi128
#define v16_cmp __builtin_ia32_pcmpgtw128
#define v16_interleavel __builtin_ia32_punpcklwd128
#define v16_interleaveh __builtin_ia32_punpckhwd128
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l __builtin_ia32_pslldi128
#define v32_shift_r __builtin_ia32_psrldi128
#define v32_rotate(x,n) \
v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and v128_and
#define v16_or v128_or
#define v16_xor v128_xor
#define v32_and v128_and
#define v32_or v128_or
#define v32_xor v128_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v32_rotate(x,n) v128_rol32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {
{{1, 2, 4, 8, 16, 32, 64, 128}},
{{1, 60, 2, 120, 4, -17, 8, -34}},
{{1, 120, 8, -68, 64, -30, -2, 17}},
{{1, 46, 60, -67, 2, 92, 120, 123}},
{{1, 92, -17, -22, 32, 117, -30, 67}},
{{1, -67, 120, -73, 8, -22, -68, -70}},
{{1, 123, -34, -70, 128, 67, 17, 35}},
};
static const union cv FFT128_Twiddle[] = {
{{ 1, -118, 46, -31, 60, 116, -67, -61}},
{{ 2, 21, 92, -62, 120, -25, 123, -122}},
{{ 4, 42, -73, -124, -17, -50, -11, 13}},
{{ 8, 84, 111, 9, -34, -100, -22, 26}},
{{ 16, -89, -35, 18, -68, 57, -44, 52}},
{{ 32, 79, -70, 36, 121, 114, -88, 104}},
{{ 64, -99, 117, 72, -15, -29, 81, -49}},
{{128, 59, -23, -113, -30, -58, -95, -98}},
};
static const union cv FFT256_Twiddle[] = {
{{ 1, 41, -118, 45, 46, 87, -31, 14}},
{{ 60, -110, 116, -127, -67, 80, -61, 69}},
{{ 2, 82, 21, 90, 92, -83, -62, 28}},
{{ 120, 37, -25, 3, 123, -97, -122, -119}},
{{ 4, -93, 42, -77, -73, 91, -124, 56}},
{{ -17, 74, -50, 6, -11, 63, 13, 19}},
{{ 8, 71, 84, 103, 111, -75, 9, 112}},
{{ -34, -109, -100, 12, -22, 126, 26, 38}},
{{ 16, -115, -89, -51, -35, 107, 18, -33}},
{{ -68, 39, 57, 24, -44, -5, 52, 76}},
{{ 32, 27, 79, -102, -70, -43, 36, -66}},
{{ 121, 78, 114, 48, -88, -10, 104, -105}},
{{ 64, 54, -99, 53, 117, -86, 72, 125}},
{{ -15, -101, -29, 96, 81, -20, -49, 47}},
{{ 128, 108, 59, 106, -23, 85, -113, -7}},
{{ -30, 55, -58, -65, -95, -40, -98, 94}}
};
#endif

View File

@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} c11_ctx_holder;
c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
init_luffa( &c11_ctx.luffa, 512 );
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &c11_ctx.simd );
#else
init_sd( &c11_ctx.simd, 512 );
#endif
}
void c11_hash( void *output, const void *input )
@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
int scanhash_c11( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

View File

@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -37,11 +33,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
#ifdef __AES__
hashState_groestl groestl;
#else
@@ -62,11 +54,6 @@ void init_tt10_ctx()
init_luffa( &tt10_ctx.luffa, 512 );
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &tt10_ctx.simd );
#else
init_sd( &tt10_ctx.simd, 512 );
#endif
#ifdef __AES__
init_groestl( &tt10_ctx.groestl, 64 );
#else
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
}
break;
case 9:
if ( i == 0 )
{
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hashB,
(const BitSequence *)input + midlen, tail*8 );
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
final_sd( &ctx.simd, (BitSequence *)hashB );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
break;
default:
break;
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
break;
case 9:
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
#if defined(__aarch64__)
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
sph_simd512_close( &tt10_mid.simd, hash);
#else
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
#endif
break;
default:
break;
}

View File

@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;
x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}
void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__

View File

@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;
@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}
void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;
@@ -68,14 +60,9 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};
@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;
@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};
@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
//11---echo---
#ifdef __AES__

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );

View File

@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"

View File

@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &sonoa_ctx.simd );
#else
init_sd( &sonoa_ctx.simd, 512 );
#endif
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);
if ( work_restart[thr_id].restart ) return 0;
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"
@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif
//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

View File

@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

View File

@@ -18,7 +18,6 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/fugue/fugue-aesni.h"
#include "algo/whirlpool/sph_whirlpool.h"

View File

@@ -1,14 +0,0 @@
#!/bin/bash
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -15,51 +15,43 @@ rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-crypto-sha3
make clean || echo clean
CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-crypto
make clean || echo clean
CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.4-crypto-sha3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -4,7 +4,7 @@
# during develpment. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -18,28 +18,55 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-alderlake
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake
#mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Granitrapids will not include AVX10, SHA512 or APX,
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# Force AVX10-256
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256
# Force SHA512 AVX10-512
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver5" ./configure --with-curl
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-zen4
#mv cpuminer cpuminer-zen5
# Zen4: AVX512 SHA VAES
make clean || echo clean
@@ -70,7 +97,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512
# AVX2 SHA VAES: generic
# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
make clean || echo done
rm -f config.status
# vaes doesn't include aes

View File

@@ -1,10 +0,0 @@
#!/bin/bash
#
# Compile on Windows using MSYS2 and MinGW.
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,20 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
#!/bin/sh
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
#strip -s cpuminer

View File

@@ -1,8 +1,8 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targetted executables.
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

View File

@@ -3,7 +3,7 @@
#ifdef WIN32
#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif

2226
configure vendored

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.5])
AC_INIT([cpuminer-opt], [25.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -41,32 +41,30 @@ AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc, le16dec, le16enc], [], [],
AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
AC_ARG_ENABLE([assembly],
@@ -75,54 +73,14 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",[])
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))
AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
@@ -136,16 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],
@@ -158,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.4.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.2.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.4'
PACKAGE_STRING='cpuminer-opt 24.4'
PACKAGE_VERSION='25.2'
PACKAGE_STRING='cpuminer-opt 25.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,14 +657,14 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
ARCH_ARM_TRUE
ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -796,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1360,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.4 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.2:";;
esac
cat <<\_ACEOF
@@ -1455,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1538,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.4
cpuminer-opt configure 25.2
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.4, which was
It was created by cpuminer-opt $as_me 25.2, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.4'
VERSION='25.2'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5810,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_11+y}
if test ${ac_cv_prog_cxx_cxx11+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_11=no
ac_cv_prog_cxx_cxx11=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -5856,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_98+y}
if test ${ac_cv_prog_cxx_cxx98+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_98=no
ac_cv_prog_cxx_cxx98=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -6502,32 +6500,30 @@ then :
fi
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
# Check whether --enable-assembly was given.
@@ -6542,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6705,51 +6582,7 @@ else $as_nop
fi
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthread $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthread_pthread_create=yes
else $as_nop
ac_cv_lib_pthread_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
fi
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
@@ -6787,12 +6620,132 @@ printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthread"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5
printf %s "checking for pthread_create in -lpthreadGC2... " >&6; }
if test ${ac_cv_lib_pthreadGC2_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC2 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC2_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC2_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC2_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5
printf %s "checking for pthread_create in -lpthreadGC1... " >&6; }
if test ${ac_cv_lib_pthreadGC1_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC1 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC1_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC1_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC1_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC1"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5
printf %s "checking for pthread_create in -lpthreadGC... " >&6; }
if test ${ac_cv_lib_pthreadGC_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC"
fi
fi
fi
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
@@ -6847,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6863,12 +6808,12 @@ else
ARCH_x86_64_FALSE=
fi
if test x$have_arm = xtrue; then
ARCH_ARM_TRUE=
ARCH_ARM_FALSE='#'
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
ARCH_ARM_TRUE='#'
ARCH_ARM_FALSE=
ARCH_ARM64_TRUE='#'
ARCH_ARM64_FALSE=
fi
if test "x$OS" = "xWindows_NT"; then
@@ -6879,13 +6824,15 @@ else
MINGW_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
if test x$have_apple = xtrue; then
HAVE_APPLE_TRUE=
HAVE_APPLE_FALSE='#'
else
JANSSON_LIBS=-ljansson
HAVE_APPLE_TRUE='#'
HAVE_APPLE_FALSE=
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
# Check whether --with-curl was given.
@@ -6902,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7102,22 +7029,22 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_ARM_TRUE}" && test -z "${ARCH_ARM_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM\" was never defined.
if test -z "${ARCH_ARM64_TRUE}" && test -z "${ARCH_ARM64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7508,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.4, which was
This file was extended by cpuminer-opt $as_me 25.2, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.4
cpuminer-opt config.status 25.2
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -206,7 +206,7 @@ static uint32_t last_block_height = 0;
static double highest_share = 0; // highest accepted share diff
static double lowest_share = 9e99; // lowest accepted share diff
static double last_targetdiff = 0.;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
static uint32_t hi_temp = 0;
static uint32_t prev_temp = 0;
#endif
@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];
GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}
@@ -992,19 +991,19 @@ void report_summary_log( bool force )
if ( rejected_share_count > 10 )
{
if ( rejected_share_count > ( submitted_share_count * .5 ) )
if ( rejected_share_count > ( submitted_share_count / 2 ) )
{
applog(LOG_ERR,"Excessive rejected share rate, exiting...");
exit(1);
}
else if ( rejected_share_count > ( submitted_share_count * .1 ) )
else if ( rejected_share_count > ( submitted_share_count / 10 ) )
applog(LOG_WARNING,"High rejected share rate, check settings.");
}
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &five_min_start );
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
// Display CPU temperature and clock rate.
int curr_temp = cpu_temp(0);
@@ -1013,8 +1012,9 @@ void report_summary_log( bool force )
if ( !opt_quiet || ( curr_temp >= 80 ) )
{
int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 :
curr_temp >= 70 ? 60 : 120;
int wait_time = curr_temp >= 90 ? 5
: curr_temp >= 80 ? 30
: curr_temp >= 70 ? 60 : 120;
timeval_subtract( &diff, &now, &cpu_temp_time );
if ( ( diff.tv_sec > wait_time )
|| ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) )
@@ -1912,7 +1912,7 @@ static bool wanna_mine(int thr_id)
{
bool state = true;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
if (opt_max_temp > 0.0)
{
@@ -2400,7 +2400,7 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32) || defined(__APPLE__))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else
float lo_freq = 0., hi_freq = 0.;
@@ -2840,8 +2840,6 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_aarch64 = cpu_arch_aarch64();
bool cpu_has_x86_64 = cpu_arch_x86_64();
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
@@ -2914,7 +2912,8 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64_only
// x86_64 only
#if defined(__SSE2__)
sw_has_sse2 = true;
#endif
@@ -2942,9 +2941,10 @@ static bool cpu_capability( bool display_only )
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
@@ -2955,6 +2955,7 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
@@ -2972,87 +2973,93 @@ static bool cpu_capability( bool display_only )
sw_has_sme2 = true;
#endif
// CPU
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf("SW built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-");
printf("%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf("\n");
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
// OS
#if defined(__linux)
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
printf(" Windows");
#if defined(__MINGW64__)
printf(" MinGW-w64\n");
#else
printf("\n");
#endif
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" BSD/Unix\n");
#else
printf("\n");
#endif
printf("CPU features: ");
if ( cpu_has_x86_64 )
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
}
else if ( cpu_has_aarch64 )
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
}
else if ( sw_has_aarch64 )
{
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
}
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( !display_only )
{
printf("\nAlgo features: ");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
@@ -3060,7 +3067,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
@@ -3105,6 +3112,7 @@ static bool cpu_capability( bool display_only )
return true;
}
/*
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
@@ -3161,12 +3169,11 @@ void show_version_and_exit(void)
printf("\n");
exit(0);
}
*/
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3182,7 +3189,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;
switch( key )
@@ -3326,7 +3332,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url
case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3391,7 +3398,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass
case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3551,10 +3559,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);
default:
@@ -3693,9 +3701,6 @@ int main(int argc, char *argv[])
{
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3866,12 +3871,23 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
#endif
conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );
@@ -3892,7 +3908,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

12
miner.h
View File

@@ -3,10 +3,7 @@
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
// CPU architecture
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
@@ -17,14 +14,15 @@
#define USER_AGENT_ARCH
#endif
// Operating system
// __APPLE__ includes MacOS & IOS, no MacOS only macros found.
#if defined(__linux)
#define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32)
#define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else
#define USER_AGENT_OS
@@ -191,7 +189,7 @@ static inline uint32_t swab32(uint32_t x)
return __builtin_bswap32(x);
#else
return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );
// return bswap_32(v);

View File

@@ -29,7 +29,6 @@
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors (Not used in cpuminer-opt)
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -141,18 +140,15 @@
#include <stdint.h>
#include <stddef.h>
// GCC-14.1: the AVX512 macros are defined even when compiled with only
// -mavx10.1-256, causing compile errors in AVX512 code. Only with
// -mavx10.1-512 does it compile successfully.
// __EVEX512__ is set only when compiled with -mavx10.1-512.
// Adding -fno-evex512 doesn't help.
// Building with -mapxf fails on a CPU without APX because configure can't
// run its test program.
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only whithout AVX10.
/*
// Test for macros
#ifdef __AVX10__
#warning "__AVX10__"
#endif
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#endif
@@ -162,39 +158,38 @@
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#warning "AVX512"
#endif
*/
// AVX10 complicates vector support by adding AVX512 features to CPUs without 512 bit
// vector support. AVX10.1 is just a renaming of AVX512 and is only available for
// Intel P-core only CPUs. AVX10.2 adds support for E-cores that don't support 512 bit
// vectors. The following macros simplify things.
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and must be
// tested seperately.
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested seperately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
// AVX10 can exist without support for 512 bit vectors.
#if defined(__AVX10_1_512__)
#define SIMD512 1
#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SIMD512 1
#endif
#if defined(__AVX10_1__)
// AVX512VL instructions applied to 256 & 128 bit vectors is supported with
// either AVX512VL or AVX10. Support for CPUs without 512 bit vectors is available
// with AVX10.2.
#if defined(__AVX10_2__) || defined(__AVX10_1_512__)
#define VL256 1
#elif defined(__AVX512VL__)
#define VL256 1
#endif
// VBMI does not exist on early versions of AVX512
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define VL256 1
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VBMI 1
#endif
#endif
/*
@@ -221,9 +216,6 @@
#include "simd-utils/simd-int.h"
// x86_64 MMX 64 bit vectors
#include "simd-utils/simd-64.h"
// x86_64 SSE2 128 bit vectors
#include "simd-utils/simd-128.h"
@@ -233,10 +225,6 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"
// move up after cleaning
// CPU architectire abstraction
//#include "simd-utils/simd-portable.h"
// aarch64 neon 128 bit vectors
#include "simd-utils/simd-neon.h"

View File

@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,
// 4x32
#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
#if defined(__x86_64__) && defined(__SSE2__)
#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
{ \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
}
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u32( src0, 4 );
s.val[1] = casti_v128u32( src1, 4 );
s.val[2] = casti_v128u32( src2, 4 );
s.val[3] = casti_v128u32( src3, 4 );
vst4q_u32( dst + 256, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u32( src0, 5 );
s.val[1] = casti_v128u32( src1, 5 );
s.val[2] = casti_v128u32( src2, 5 );
s.val[3] = casti_v128u32( src3, 5 );
vst4q_u32( dst + 320, s );
s.val[0] = casti_v128u32( src0, 6 );
s.val[1] = casti_v128u32( src1, 6 );
s.val[2] = casti_v128u32( src2, 6 );
s.val[3] = casti_v128u32( src3, 6 );
vst4q_u32( dst + 384, s );
s.val[0] = casti_v128u32( src0, 7 );
s.val[1] = casti_v128u32( src1, 7 );
s.val[2] = casti_v128u32( src2, 7 );
s.val[3] = casti_v128u32( src3, 7 );
vst4q_u32( dst + 448, s );
// if ( bit_len <= 1024 return;
}
static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
}
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
if ( bit_len <= 256 ) return;
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
if ( bit_len <= 512 ) return;
s = vld4q_u32( src + 256 );
casti_v128( dst0, 4 ) = s.val[0];
casti_v128( dst1, 4 ) = s.val[1];
casti_v128( dst2, 4 ) = s.val[2];
casti_v128( dst3, 4 ) = s.val[3];
if ( bit_len <= 640 ) return;
s = vld4q_u32( src + 320 );
casti_v128( dst0, 5 ) = s.val[0];
casti_v128( dst1, 5 ) = s.val[1];
casti_v128( dst2, 5 ) = s.val[2];
casti_v128( dst3, 5 ) = s.val[3];
s = vld4q_u32( src + 384 );
casti_v128( dst0, 6 ) = s.val[0];
casti_v128( dst1, 6 ) = s.val[1];
casti_v128( dst2, 6 ) = s.val[2];
casti_v128( dst3, 6 ) = s.val[3];
s = vld4q_u32( src + 448 );
casti_v128( dst0, 6 ) = s.val[0];
casti_v128( dst1, 6 ) = s.val[1];
casti_v128( dst2, 6 ) = s.val[2];
casti_v128( dst3, 6 ) = s.val[3];
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
}
#else // !SSE2 && !NEON
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u32_t s0 = casti_v128u32( src,0 );
v128u32_t s1 = casti_v128u32( src,1 );
v128u32_t s2 = casti_v128u32( src,2 );
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#endif
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128u32( d,16 ) = v128_duplane32( s2, 0 );
casti_v128u32( d,17 ) = v128_duplane32( s2, 1 );
casti_v128u32( d,18 ) = v128_duplane32( s2, 2 );
casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
}
#endif
// 8x32
#if defined(__AVX2__)
#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data
// 2x64 SSE2, NEON
// 2x64
#if defined(__x86_64__) && defined(__SSE2__)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d1[7] = v128_unpackhi64( s[14], s[15] );
}
/*
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64x2x2_t s;
s.val[0] = casti_v128u64( src0, 0 );
s.val[1] = casti_v128u64( src1, 0 );
vst2q_u64( dst, s );
s.val[0] = casti_v128u64( src0, 1 );
s.val[1] = casti_v128u64( src1, 1 );
vst2q_u64( dst + 32, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u64( src0, 2 );
s.val[1] = casti_v128u64( src1, 2 );
vst2q_u64( dst + 64, s );
s.val[0] = casti_v128u64( src0, 3 );
s.val[1] = casti_v128u64( src1, 3 );
vst2q_u64( dst + 96, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u64( src0, 4 );
s.val[1] = casti_v128u64( src1, 4 );
vst2q_u64( dst + 128, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u64( src0, 5 );
s.val[1] = casti_v128u64( src1, 5 );
vst2q_u64( dst + 160, s );
s.val[0] = casti_v128u64( src0, 6 );
s.val[1] = casti_v128u64( src1, 6 );
vst2q_u64( dst + 192, s );
s.val[0] = casti_v128u64( src0, 7 );
s.val[1] = casti_v128u64( src1, 7 );
vst2q_u64( dst + 224, s );
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64x2x2_t s = vld2q_u64( src );
casti_v128u64( dst0, 0 ) = s.val[0];
casti_v128u64( dst1, 0 ) = s.val[1];
s = vld2q_u64( src + 32 );
casti_v128u64( dst0, 1 ) = s.val[0];
casti_v128u64( dst1, 1 ) = s.val[1];
if ( bit_len <= 256 ) return;
s = vld2q_u64( src + 64 );
casti_v128u64( dst0, 2 ) = s.val[0];
casti_v128u64( dst1, 2 ) = s.val[1];
s = vld2q_u64( src + 96 );
casti_v128u64( dst0, 3 ) = s.val[0];
casti_v128u64( dst1, 3 ) = s.val[1];
if ( bit_len <= 512 ) return;
s = vld2q_u64( src + 128 );
casti_v128u64( dst0, 4 ) = s.val[0];
casti_v128u64( dst1, 4 ) = s.val[1];
if ( bit_len <= 640 ) return;
s = vld2q_u64( src + 160 );
casti_v128u64( dst0, 5 ) = s.val[0];
casti_v128u64( dst1, 5 ) = s.val[1];
s = vld2q_u64( src + 192 );
casti_v128u64( dst0, 6 ) = s.val[0];
casti_v128u64( dst1, 6 ) = s.val[1];
s = vld2q_u64( src + 224 );
casti_v128u64( dst0, 7 ) = s.val[0];
casti_v128u64( dst1, 7 ) = s.val[1];
// if ( bit_len <= 1024 ) return;
}
#else
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}
*/
/*
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
*/
#endif
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u64_t s0 = casti_v128u64( src,0 );
v128u64_t s1 = casti_v128u64( src,1 );
v128u64_t s2 = casti_v128u64( src,2 );
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#endif
#if defined(__SSE2__)
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
#endif
casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
static inline void extr_lane_2x64( void *dst, const void *src,

View File

@@ -439,11 +439,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( _mm_xor_si128( a, b ), c )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( _mm_and_si128( a, b ), c )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( _mm_or_si128( a, b ), c )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )

View File

@@ -174,17 +174,22 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
// usage hints to improve performance when ternary logic is not avalable:
// If overwriting an input arg put that arg first so the intermediate
// result can be stored in the dest.
// Put an arg with the nearest dependency last so independant args can be
// processed first.
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
_mm256_xor_si256( _mm256_xor_si256( a, b ), c )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
_mm256_and_si256( _mm256_and_si256( a, b ), c )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
_mm256_or_si256( _mm256_or_si256( a, b ), c )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )

View File

@@ -1,193 +0,0 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__x86_64__) && defined(__MMX__)
////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
//
// This code is not used anywhere annd likely never will. It's intent was
// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
// functions, but hasn't been implementedwas never implemented.
//
// MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
// registers and instructions. MMX will still be available using ASM.
// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
// using explicit SSE with the upper 64 bits being ignored.
// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
// of the even index SSE register.
// In most cases the exiting 4x32 SSE code can be used with 2 lanes being ignored
// making ths file obsolete.
#define v64_t __m64
#define v64u32_t v64_t
#define v64_load _mm_load_si64
#define v64_store _mm_store_si64
#define v64_64(i64) ((__m64)(i64))
#define v64_32 _mm_set1_pi32
#define v64_16 _mm_set1_pi16
#define v64_8 _mm_set1_pi8
#define v64_add32 _mm_add_pi32
#define v64_add16 _mm_add_pi16
#define v64_add8 _mm_add_pi8
#define v64_mul32 _mm_mullo_pi32
#define v64_mul16 _mm_mullo_pi16
// compare
#define v64_cmpeq32 _mm_cmpeq_epi32
#define v64_cmpeq16 _mm_cmpeq_epi16
#define v64_cmpeq8 _mm_cmpeq_epi8
#define v64_cmpgt32 _mm_cmpgt_epi32
#define v64_cmpgt16 _mm_cmpgt_epi16
#define v64_cmpgt8 _mm_cmpgt_epi8
#define v64_cmplt32 _mm_cmplt_epi32
#define v64_cmplt16 _mm_cmplt_epi16
#define v64_cmplt8 _mm_cmplt_epi8
// bit shift
#define v64_sl32 _mm_slli_epi32
#define v64_sl16 _mm_slli_epi16
#define v64_sl8 _mm_slli_epi8
#define v64_sr32 _mm_srli_epi32
#define v64_sr16 _mm_srli_epi16
#define v64_sr8 _mm_srli_epi8
#define v64_sra32 _mm_srai_epi32
#define v64_sra16 _mm_srai_epi16
#define v64_sra8 _mm_srai_epi8
#define v64_alignr8 _mm_alignr_pi8
#define v64_unpacklo32 _mm_unpacklo_pi32
#define v64_unpackhi32 _mm_unpackhi_pi32
#define v64_unpacklo16 _mm_unpacklo_pi16
#define v64_unpackhi16 _mm_unpacklhi_pi16
#define v64_unpacklo8 _mm_unpacklo_pi8
#define v64_unpackhi8 _mm_unpackhi_pi16
// Pseudo constants
#define v64_zero _mm_setzero_si64()
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
#define v64_one_32 v64_32( 1UL )
#define v64_one_16 v64_16( 1U )
#define v64_one_8 v64_8( 1U );
#define v64_neg1 v64_32( 0xFFFFFFFFUL )
#define casti_v64(p,i) (((v64_t*)(p))[(i)])
// Bitwise not: ~(a)
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
static inline void v64_memset_zero( __m64 *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v64_or _mm_or_si64
#define v64_and _mm_and_si64
#define v64_xor _mm_xor_si64
#define v64_andnot _mm_andnot_si64
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
// Rotate bits in packed elements of 64 bit vector
#define v64_rol64( a, n ) \
_mm_or_si64( _mm_slli_si64( a, n ), \
_mm_srli_si64( a, 64-(n) ) )
#define v64_ror64( a, n ) \
_mm_or_si64( _mm_srli_si64( a, n ), \
_mm_slli_si64( a, 64-(n) ) )
#define v64_rol32( a, n ) \
_mm_or_si64( _mm_slli_pi32( a, n ), \
_mm_srli_pi32( a, 32-(n) ) )
#define v64_ror32( a, n ) \
_mm_or_si64( _mm_srli_pi32( a, n ), \
_mm_slli_pi32( a, 32-(n) ) )
#define v64_rol16( a, n ) \
_mm_or_si64( _mm_slli_pi16( a, n ), \
_mm_srli_pi16( a, 16-(n) ) )
#define v64_ror16( a, n ) \
_mm_or_si64( _mm_srli_pi16( a, n ), \
_mm_slli_pi16( a, 16-(n) ) )
// Rotate packed elements accross lanes. Useful for byte swap and byte
// rotation.
#if defined(__SSE__)
// Swap hi & lo 32 bits.
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )
#define v64_shulfr16( a ) _mm_shuffle_pi16( a, 0x39 )
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#endif // SSE
#if defined(__SSSE3__)
// Endian byte swap packed elements
#define v64_bswap32( v ) \
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
#define v64_bswap16( v ) \
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else
#define v64_bswap32( v ) \
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
#define v64_bswap16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
#endif // SSSE3
#define v64_blendv( v1, v0, mask ) \
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
#endif // MMX
#endif // SIMD_64_H__

View File

@@ -2,7 +2,7 @@
#define SIMD_INT_H__ 1
//TODO compile time test for byte order
// be64 etc using HW bowap.
// be64 etc using HW bswap.
//
// Endian byte swap
#if defined(__x86_64__)
@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
return b;
}
// This produces warnings from clang, but its suggested workaround
// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
// it works as is on both.
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
@@ -94,7 +97,7 @@ static inline uint16_t be16( const uint16_t u16 )
return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) << 8 );
}
static inline uint32_t le162( const uint16_t u16 )
static inline uint32_t le16( const uint16_t u16 )
{
const uint8_t *p = (uint8_t const *)&u16;
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
@@ -112,7 +115,7 @@ static inline uint32_t le162( const uint16_t u16 )
#elif defined(__aarch64__)
// Documentation is vague, ror exists but is ambiguous. Docs say it can
// do 32 or 64 registers. Assuming that is architecture specific andcan
// do 32 or 64 bit registers. Assuming that is architecture specific and can
// only do 32 bit on 32 bit arch. Rarely used so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{

View File

@@ -93,6 +93,8 @@
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )
#define v128_cmpeq_zero vceqzq_u64
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
@@ -135,14 +137,14 @@
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#endif
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )
// v2 ^ ( ~v1 & v0 )
#if defined(__ARM_FEATURE_SHA3)
@@ -178,6 +180,7 @@
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
@@ -237,18 +240,15 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
#define v128_zero v128_64( 0ull )
#define v128_cmpeq_zero vceqzq_u64
#define v128_neg1 v128_64( 0xffffffffffffffffull )
// set1
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
#define v128_8 vmovq_n_u8
#define v128_zero v128_64( 0ull )
#define v128_neg1 v128_64( 0xffffffffffffffffull )
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint16x8_t)(v)), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
((uint16x8_t)(v)), c )
#define v128_ror8( v, c ) \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
// ( v1 ^ v0 ) >>> n
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
#else
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2rorx32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_lrev16 vrev32q_u16
// aka bswap
#define v128_qrev8 vrev64q_u8
#define v128_lrev8 vrev32q_u8
#define v128_wrev8 vrev16q_u8
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Usefull for x86_64 but does nothing for ARM
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )
#endif // __ARM_NEON
#endif // SIMD_NEON_H__

View File

@@ -1,25 +1,152 @@
// Placeholder for now.
//
// This file will hold AArch64 SVE code, a replecement for NEON that uses vector length
// agnostic instructions. This means the same code can be used on CPUs with different
// SVE vector register lengths. This is not good for vectorized hashing.
// This file will hold AArch64 SVE code, a replecement for NEON that uses
// vector length agnostic instructions. This means the same code can be used
// on CPUs with different SVE vector register lengths. This is not good for
// vectorized hashing.
// Optimum hash is sensitive to the vector register length with different code
// used for different register sizes. On X86_64 the vector length is tied to the CPU
// feature making it simple and efficient to handle different lengths although it
// results in multiple executables. Theoretically SVE could use a single executable for
// any vector length.
// used for different register sizes. On X86_64 the vector length is tied to
// the CPU feature making it simple and efficient to handle different lengths
// although it results in multiple executables. Theoretically SVE could use a
// single executable for any vector length.
//
// With the SVE vector length only known at run time it resultis in run time overhead
// to test the vector length. Theoretically it could be tested at program loading and
// appropriate libraries loaded. However I don't know if this can be done and if so
// how to do it.
// With the SVE vector length only known at run time it results in run time
// overhead to test the vector length. Theoretically it could be tested at
// program loading and appropriate libraries loaded. However I don't know if
// this can be done and if specified how to do it.
//
// SVE is not expected to be used for 128 bit vectors as it does not provide any
// advantages over NEON. However, it may be implemented for testing purposes
// because CPU with registers larger than 128 bits are currently very rare and very
// expensive server class CPUs.
// because CPU with registers larger than 128 bits are currently very rare and
// very expensive server class CPUs.
//
// N-way parallel hashing could be the best use of SVE, usimg the same code for all
// vector lengths with the only variable being the number of lanes. This will still
// require run time checking but should be lighter than substituting functions.
// However, 128 bit vectors also need to be supported with 256 bit registers.
// This could be a challenge for un-predicated functions.
//
// N-way parallel hashing could be the best use of SVE, usimg the same code
// for all vector lengths with the only variable being the number of lanes.
// This will still require run time checking but should be lighter than
// substituting functions.
// Current approach is to hard code the length in these intrinsics and called
// by existing length specific code.
// define with sv_ prefix for generic use predicate provided by caller,
// use sv<size>_ with hard coded predicate.
// v<size>_ only if and when it's compatible with SSE & NEON
// Many instructions have no predicate operand, how is VVL handled?
// How does the CPU know how long the vector is and whether it spans
// multiple registers without the predicate?
// Also how does the predicate define the vector size? How to tell if inactive
// high lanes are part of the vector or beyond its range.
//
// Some intructions may have an implied predicate by other arguments.
// TBL for example will only have shuffle indexes for active lanes.
// However this is dependant on software being aware of register size.
#if 0
// #if defined USE_SV128
// NEON needs to be disabled
#define PRED128 0xffff
#define PRED256 0xffffffff
// Types should be transparent
#define sv128u32_t svuint32_t
#define sv256u32_t svuint32_t
// load1
// arithmetic
// _z zero inactive elements, _x undefined inactive elements, _m inactive
// elements from first arg. arg order only matters when _m used. Use _x.
#define sv_add32( p, v1, v0 ) svadd_u32_x( p, v1, v0 )
#define sv128_add32( v1, v0 ) svadd_u32_x( PRED128, v1, v0 )
#define sv256_add32( v1, v0 ) svadd_u32_x( PRED256, v1, v0 )
// Add integer to each element
#define sv_addi32( p, v, i ) svadd_n_u32_x( p, v, i )
// compare
#define sv_cmpeq32( p, v1, v0 ) svcmpeq_u32( p, v1, v0 )
#define sv128_cmpeq32( v1, v0 ) svcmpeq_u32( PRED128, v1, v0 )
#define sv256_cmpeq32( v1, v0 ) svcmpeq_u32( PRED256, v1, v0 )
// bit shift
#define sv_sl32( v, c ) svlsl_n_u32_x( p, v, c )
#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )
// logic
#define sv_or( p, v1, v0 ) svorr_u32_x( p, v1, v0 )
#define sv128_or( v1, v0 ) svorr_u32_x( PRED128, v1, v0 )
#define sv256_or( v1, v0 ) svorr_u32_x( PRED256, v1, v0 )
// ext used for alignr, and zip used for unpack have no predicate arg.
// How is vector length determined? How are register sizes handled?
// How are part registers handled?
// alignr (ext)
// unpack
// AES
// AES uses fixed 128 bit vectors, how does this work with larger registers?
// set1
#define sv128_32( n ) svdup_n_u32_x( PRED128, n )
#define sv256_32( n ) svdup_n_u32_x( PRED256, n )
// broadcast
// svdup_lane has no predicate
// constants
// pointer cast
// Bit rotation
// No predication for shift instructions
// Cross lane shuffles
// Very limited shuffling, mostly svtbl which has no predicate and uses
// vector for the index.
// endian byte swap
#define sv128_bswap32(v) svrevb_u32_x( p, v )
// blend
#enfif

View File

@@ -16,14 +16,19 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__aarch64__)
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
#define ARM_AUXV
#endif
#if defined(ARM_AUXV)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <sys/prctl.h>
#endif
#ifndef WIN32
#if !(defined(WIN32) || defined(__APPLE__))
// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
@@ -147,7 +152,7 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )
static inline float cpu_temp( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0.;
#else
return linux_cputemp( core );
@@ -156,7 +161,7 @@ static inline float cpu_temp( int core )
static inline uint32_t cpu_clock( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0;
#else
return linux_cpufreq( core );
@@ -275,8 +280,8 @@ static inline int cpu_fanpercent()
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
#if defined(__x86_64__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -309,13 +314,14 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}
#elif defined(__aarch64__)
#elif defined(ARM_AUXV)
// Always test if HWCAP variable is defined in the kernel before attempting
// to compile it. If not defined the feature can't be tested and won't be
// included in the compile.
// This can occur if compiling with an old kernel and a new CPU and could
// result in a suboptimal build.
// leaf and subleaf arguments are ignored.
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
@@ -365,7 +371,8 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
}
#else
#define cpuid(leaf, subleaf, out) out[0] = 0;
#define cpuid( leaf, subleaf, output ) \
output[0] = output[1] = output[2] = output[3] = 0;
#endif
static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -958,10 +965,10 @@ static inline unsigned int avx10_vector_length()
return 0;
}
// ARM SVE vector register length
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
#if defined(__aarch64__)
#if defined(ARM_AUXV)
if ( has_sve() )
return prctl( (PR_SVE_GET_VL & PR_SVE_VL_LEN_MASK) * 8 );
#endif

6
util.c
View File

@@ -1414,6 +1414,12 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
int n;
fd_set wd;
// Something nasty going on With Windows on aarch64. This hack prevents
// corrupting the sctx pointer. This only works if placed inside the while loop.
#if defined(__aarch64__) && defined(WIN32) && defined(ARM_WIN_HACK)
printf("");
#endif
FD_ZERO( &wd );
FD_SET( sctx->sock, &wd );
if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )

View File

@@ -11,16 +11,11 @@
export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
@@ -40,7 +35,6 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
#cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building...