Mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00
Compare commits
5 Commits
Author | SHA1 | Date
---|---|---
 | 1ed18bf22e |
 | 1d9341ee92 |
 | a45a333b40 |
 | 2b1037a7c7 |
 | 06624a0ff2 |
Makefile.am (67 lines changed)
@@ -1,19 +1,39 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.

if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif

EXTRA_DIST = example-cfg.json nomacro.pl
else

SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif

ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif

bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl

dist_man_MANS = cpuminer.1
SUBDIRS = compat

ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.

bin_PROGRAMS = cpuminer

dist_man_MANS = cpuminer.1

cpuminer_SOURCES = \
dummy.cpp \
@@ -166,8 +186,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -274,29 +292,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c

disable_flags =

if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif

if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif

cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif

if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif

if HAVE_WINDOWS

# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -310,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<

endif
README.md (26 lines changed)
@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------

1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.

32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.

Mobile CPUs such as those in laptop computers are not recommended because
they aren't designed for the extreme heat of operating at full load for
extended periods of time.

Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.

2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.

Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. Windows XP 64 bit is YMMV.

FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.

3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.

Supported Algorithms
--------------------
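As an illustration of requirement 3, a typical invocation against a stratum
pool looks like the sketch below. The pool host, port, wallet and worker
names are placeholders, not real endpoints; -a selects the algorithm, -o the
pool or node URL, -u the user/wallet and -p the password.

./cpuminer -a x16r -o stratum+tcp://pool.example.com:3333 -u WALLET.worker -p x
./cpuminer -a sha256d -o http://127.0.0.1:8332 -u rpcuser -p rpcpass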
@@ -75,6 +75,43 @@ If not what makes it happen or not happen?
Change Log
----------

v25.2

ARM: Fixed a regression from v25.1 that could cause the build to fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with the installed jansson library instead of compiling the included source code.
Windows: removed "_WIN32_WINNT=0x0601", which is a downgrade on Win11.
Changed the build.sh shell from bash to sh.

v25.1

MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: now works when compiled with GCC.
Fixed some minor bugs & removed some obsolete code.

v24.8

ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fixed incorrect compiler version display when using clang.
build.sh can now be used to compile all targets; arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default; prebuilt binaries
continue to be compiled with CPU groups disabled.

v24.7

ARM: compile works for Windows using MSys2 & MinGW, see wiki for details.

v24.6

ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.

v24.5

Fixed a MinGW compile error after the MSys2 upgrade to GCC-14.2.
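The build.sh mentioned in v25.2 and v24.8 wraps the usual autotools flow. A
minimal manual sequence on Linux, assuming an installed libcurl and the
standard autotools layout, would be roughly:

./autogen.sh
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j$(nproc)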
@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
// Hamsi 8 way AVX512

// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.

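For reference, the two alternatives the comment above compares can be
sketched as follows; this is an illustrative fragment, not code from the
file. Both forms extract the sign bit of each 64-bit lane into an 8-bit mask:

#include <immintrin.h>

__mmask8 sign_mask_movepi( __m512i v )
{  return _mm512_movepi64_mask( v );  }   // direct sign-bit move (AVX512DQ)

__mmask8 sign_mask_cmplt( __m512i v )
{  return _mm512_cmplt_epi64_mask( v, _mm512_setzero_si512() );  }   // v < 0 (AVX512F)
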
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 ); /* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}

/*
#define SBOX8( a, b, c, d ) \
do { \

#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}

#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}

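INPUT_BIG_sub and INPUT_BIG above implement a branch-free table lookup: each
message bit is shifted into the sign position, broadcast to an all-ones or
all-zero lane mask by the signed compare against zero, and the mask then
gates an XOR of eight table words. A scalar sketch of the same idea, assuming
a T512-like table with eight 64-bit words per message bit and an arithmetic
right shift for signed integers (as GCC and Clang provide):

#include <stdint.h>

static void input_big_scalar( uint64_t m[8], uint64_t db,
                              const uint64_t table[64][8] )
{
   for ( int i = 0; i < 8; i++ )  m[i] = 0;
   for ( int s = 63; s >= 0; s-- )
   {
      // Bit (63-s) of db moves to the sign position, then the arithmetic
      // shift smears it across the word: all ones if set, else zero.
      uint64_t mask = (uint64_t)( (int64_t)( db << s ) >> 63 );
      for ( int i = 0; i < 8; i++ )
         m[i] ^= mask & table[63 - s][i];   // masked XOR, no branch
   }
}
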
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;

#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}

#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}

#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;

eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;

eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"

#if !defined(__APPLE__)

#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}

/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/

double swit2_( double wvnmb )
{
@@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}

#endif // not apple

bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );

simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );

rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );

@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );

if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );

if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );

if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

@@ -46,7 +46,7 @@
#endif

#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

algo/simd/nist.c (472 lines changed)
@@ -1,472 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "nist.h"
|
||||
#include "simd_iv.h"
|
||||
|
||||
|
||||
/* #define NO_PRECOMPUTED_IV */
|
||||
#if defined(__SSE2__) // || defined(__ARM_NEON)
|
||||
|
||||
/*
|
||||
* Increase the counter.
|
||||
*/
|
||||
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
|
||||
#ifdef HAS_64
|
||||
state->count += databitlen;
|
||||
#else
|
||||
uint32_t old_count = state->count_low;
|
||||
state->count_low += databitlen;
|
||||
if (state->count_low < old_count)
|
||||
state->count_high++;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialize the hashState_sd with a given IV.
|
||||
* If the IV is NULL, initialize with zeros.
|
||||
*/
|
||||
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
|
||||
|
||||
int n = 8;
|
||||
|
||||
state->hashbitlen = hashbitlen;
|
||||
state->n_feistels = n;
|
||||
state->blocksize = 128*8;
|
||||
|
||||
#ifdef HAS_64
|
||||
state->count = 0;
|
||||
#else
|
||||
state->count_low = 0;
|
||||
state->count_high = 0;
|
||||
#endif
|
||||
|
||||
// state->buffer = malloc(16*n + 16);
|
||||
/*
|
||||
* Align the buffer to a 128 bit boundary.
|
||||
*/
|
||||
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
|
||||
|
||||
// state->A = malloc((4*n+4)*sizeof(u32));
|
||||
/*
|
||||
* Align the buffer to a 128 bit boundary.
|
||||
*/
|
||||
// state->A += ((u32*)NULL - state->A)&3;
|
||||
|
||||
state->B = state->A+n;
|
||||
state->C = state->B+n;
|
||||
state->D = state->C+n;
|
||||
|
||||
if (IV)
|
||||
memcpy(state->A, IV, 4*n*sizeof(u32));
|
||||
else
|
||||
memset(state->A, 0, 4*n*sizeof(u32));
|
||||
|
||||
// free(state->buffer);
|
||||
// free(state->A);
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the hashState_sd.
|
||||
*/
|
||||
int init_sd(hashState_sd *state, int hashbitlen) {
|
||||
int r;
|
||||
char *init;
|
||||
|
||||
#ifndef NO_PRECOMPUTED_IV
|
||||
// if (hashbitlen == 224)
|
||||
// r=InitIV(state, hashbitlen, IV_224);
|
||||
// else if (hashbitlen == 256)
|
||||
// r=InitIV(state, hashbitlen, IV_256);
|
||||
// else if (hashbitlen == 384)
|
||||
// r=InitIV(state, hashbitlen, IV_384);
|
||||
// else
|
||||
if (hashbitlen == 512)
|
||||
r = InitIV(state, hashbitlen, IV_512);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/*
|
||||
* Nonstandard length: IV is not precomputed.
|
||||
*/
|
||||
r=InitIV(state, hashbitlen, NULL);
|
||||
if (r != 0)
|
||||
return r;
|
||||
|
||||
init = malloc(state->blocksize);
|
||||
memset(init, 0, state->blocksize);
|
||||
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
|
||||
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
|
||||
#else
|
||||
sprintf(init, "SIMD-%i v1.1", hashbitlen);
|
||||
#endif
|
||||
SIMD_Compress(state, (unsigned char*) init, 0);
|
||||
free(init);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
int update_sd( hashState_sd *state, const BitSequence *data,
|
||||
DataLength databitlen )
|
||||
{
|
||||
unsigned current;
|
||||
unsigned int bs = state->blocksize;
|
||||
static int align = -1;
|
||||
|
||||
if (align == -1)
|
||||
align = RequiredAlignment();
|
||||
|
||||
#ifdef HAS_64
|
||||
current = state->count & (bs - 1);
|
||||
#else
|
||||
current = state->count_low & (bs - 1);
|
||||
#endif
|
||||
|
||||
if ( current & 7 )
|
||||
{
|
||||
// The number of hashed bits is not a multiple of 8.
|
||||
// Very painful to implement and not required by the NIST API.
|
||||
return 1;
|
||||
}
|
||||
|
||||
while ( databitlen > 0 )
|
||||
{
|
||||
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
|
||||
{
|
||||
// We can hash the data directly from the input buffer.
|
||||
SIMD_Compress(state, data, 0);
|
||||
databitlen -= bs;
|
||||
data += bs/8;
|
||||
IncreaseCounter(state, bs);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy a chunk of data to the buffer
|
||||
unsigned int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
|
||||
IncreaseCounter( state, databitlen );
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, len/8 );
|
||||
IncreaseCounter( state,len );
|
||||
databitlen -= len;
|
||||
data += len/8;
|
||||
current = 0;
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int final_sd( hashState_sd *state, BitSequence *hashval )
|
||||
{
|
||||
#ifdef HAS_64
|
||||
uint64_t l;
|
||||
int current = state->count & (state->blocksize - 1);
|
||||
#else
|
||||
uint32_t l;
|
||||
int current = state->count_low & (state->blocksize - 1);
|
||||
#endif
|
||||
unsigned int i;
|
||||
BitSequence bs[64];
|
||||
int isshort = 1;
|
||||
|
||||
// If there is still some data in the buffer, hash it
|
||||
if ( current )
|
||||
{
|
||||
// We first need to zero out the end of the buffer.
|
||||
if ( current & 7 )
|
||||
{
|
||||
BitSequence mask = 0xff >> ( current & 7 );
|
||||
state->buffer[current/8] &= ~mask;
|
||||
}
|
||||
current = ( current+7 ) / 8;
|
||||
memset( state->buffer+current, 0, state->blocksize/8 - current );
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
|
||||
//* Input the message length as the last block
|
||||
memset( state->buffer, 0, state->blocksize / 8 );
|
||||
#ifdef HAS_64
|
||||
l = state->count;
|
||||
for ( i=0; i<8; i++ )
|
||||
{
|
||||
state->buffer[i] = l & 0xff;
|
||||
l >>= 8;
|
||||
}
|
||||
if ( state->count < 16384 )
|
||||
isshort = 2;
|
||||
#else
|
||||
l = state->count_low;
|
||||
for ( i=0; i<4; i++ )
|
||||
{
|
||||
state->buffer[i] = l & 0xff;
|
||||
l >>= 8;
|
||||
}
|
||||
l = state->count_high;
|
||||
for ( i=0; i<4; i++ )
|
||||
{
|
||||
state->buffer[4+i] = l & 0xff;
|
||||
l >>= 8;
|
||||
}
|
||||
if ( state->count_high == 0 && state->count_low < 16384 )
|
||||
isshort = 2;
|
||||
#endif
|
||||
|
||||
SIMD_Compress( state, state->buffer, isshort );
|
||||
|
||||
// Decode the 32-bit words into a BitSequence
|
||||
for ( i=0; i < 2*state->n_feistels; i++ )
|
||||
{
|
||||
u32 x = state->A[i];
|
||||
bs[4*i ] = x&0xff;
|
||||
x >>= 8;
|
||||
bs[4*i+1] = x&0xff;
|
||||
x >>= 8;
|
||||
bs[4*i+2] = x&0xff;
|
||||
x >>= 8;
|
||||
bs[4*i+3] = x&0xff;
|
||||
}
|
||||
|
||||
memcpy( hashval, bs, state->hashbitlen / 8 );
|
||||
if ( state->hashbitlen % 8 )
|
||||
{
|
||||
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
|
||||
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int update_final_sd( hashState_sd *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen )
|
||||
{
|
||||
int current, i;
|
||||
unsigned int bs = state->blocksize;
|
||||
static int align = -1;
|
||||
BitSequence out[64];
|
||||
int isshort = 1;
|
||||
uint64_t l;
|
||||
|
||||
if (align == -1)
|
||||
align = RequiredAlignment();
|
||||
|
||||
#ifdef HAS_64
|
||||
current = state->count & (bs - 1);
|
||||
#else
|
||||
current = state->count_low & (bs - 1);
|
||||
#endif
|
||||
|
||||
if ( current & 7 )
|
||||
{
|
||||
// The number of hashed bits is not a multiple of 8.
|
||||
// Very painful to implement and not required by the NIST API.
|
||||
return 1;
|
||||
}
|
||||
|
||||
while ( databitlen > 0 )
|
||||
{
|
||||
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
|
||||
{
|
||||
// We can hash the data directly from the input buffer.
|
||||
SIMD_Compress(state, data, 0);
|
||||
databitlen -= bs;
|
||||
data += bs/8;
|
||||
IncreaseCounter(state, bs);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy a chunk of data to the buffer
|
||||
unsigned int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
|
||||
IncreaseCounter( state, databitlen );
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, len/8 );
|
||||
IncreaseCounter( state,len );
|
||||
databitlen -= len;
|
||||
data += len/8;
|
||||
current = 0;
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current = state->count & (state->blocksize - 1);
|
||||
|
||||
// If there is still some data in the buffer, hash it
|
||||
if ( current )
|
||||
{
|
||||
// We first need to zero out the end of the buffer.
|
||||
if ( current & 7 )
|
||||
{
|
||||
BitSequence mask = 0xff >> ( current & 7 );
|
||||
state->buffer[current/8] &= ~mask;
|
||||
}
|
||||
current = ( current+7 ) / 8;
|
||||
memset( state->buffer+current, 0, state->blocksize/8 - current );
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
|
||||
//* Input the message length as the last block
|
||||
memset( state->buffer, 0, state->blocksize / 8 );
|
||||
l = state->count;
|
||||
for ( i=0; i<8; i++ )
|
||||
{
|
||||
state->buffer[i] = l & 0xff;
|
||||
l >>= 8;
|
||||
}
|
||||
if ( state->count < 16384 )
|
||||
isshort = 2;
|
||||
|
||||
SIMD_Compress( state, state->buffer, isshort );
|
||||
|
||||
// Decode the 32-bit words into a BitSequence
|
||||
for ( i=0; i < 2*state->n_feistels; i++ )
|
||||
{
|
||||
u32 x = state->A[i];
|
||||
out[4*i ] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+1] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+2] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+3] = x & 0xff;
|
||||
}
|
||||
|
||||
memcpy( hashval, out, state->hashbitlen / 8 );
|
||||
if ( state->hashbitlen % 8 )
|
||||
{
|
||||
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
|
||||
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int simd_full( hashState_sd *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen )
|
||||
{
|
||||
|
||||
|
||||
InitIV( state, 512, IV_512 );
|
||||
|
||||
int current, i;
|
||||
unsigned int bs = state->blocksize;
|
||||
static int align = -1;
|
||||
BitSequence out[64];
|
||||
int isshort = 1;
|
||||
uint64_t l;
|
||||
|
||||
if (align == -1)
|
||||
align = RequiredAlignment();
|
||||
|
||||
#ifdef HAS_64
|
||||
current = state->count & (bs - 1);
|
||||
#else
|
||||
current = state->count_low & (bs - 1);
|
||||
#endif
|
||||
|
||||
if ( current & 7 )
|
||||
{
|
||||
// The number of hashed bits is not a multiple of 8.
|
||||
// Very painful to implement and not required by the NIST API.
|
||||
return 1;
|
||||
}
|
||||
|
||||
while ( databitlen > 0 )
|
||||
{
|
||||
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
|
||||
{
|
||||
// We can hash the data directly from the input buffer.
|
||||
SIMD_Compress(state, data, 0);
|
||||
databitlen -= bs;
|
||||
data += bs/8;
|
||||
IncreaseCounter(state, bs);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy a chunk of data to the buffer
|
||||
unsigned int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
|
||||
IncreaseCounter( state, databitlen );
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer+current/8, data, len/8 );
|
||||
IncreaseCounter( state,len );
|
||||
databitlen -= len;
|
||||
data += len/8;
|
||||
current = 0;
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current = state->count & (state->blocksize - 1);
|
||||
|
||||
// If there is still some data in the buffer, hash it
|
||||
if ( current )
|
||||
{
|
||||
// We first need to zero out the end of the buffer.
|
||||
if ( current & 7 )
|
||||
{
|
||||
BitSequence mask = 0xff >> ( current & 7 );
|
||||
state->buffer[current/8] &= ~mask;
|
||||
}
|
||||
current = ( current+7 ) / 8;
|
||||
memset( state->buffer+current, 0, state->blocksize/8 - current );
|
||||
SIMD_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
|
||||
//* Input the message length as the last block
|
||||
memset( state->buffer, 0, state->blocksize / 8 );
|
||||
l = state->count;
|
||||
for ( i=0; i<8; i++ )
|
||||
{
|
||||
state->buffer[i] = l & 0xff;
|
||||
l >>= 8;
|
||||
}
|
||||
if ( state->count < 16384 )
|
||||
isshort = 2;
|
||||
|
||||
SIMD_Compress( state, state->buffer, isshort );
|
||||
|
||||
// Decode the 32-bit words into a BitSequence
|
||||
for ( i=0; i < 2*state->n_feistels; i++ )
|
||||
{
|
||||
u32 x = state->A[i];
|
||||
out[4*i ] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+1] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+2] = x & 0xff;
|
||||
x >>= 8;
|
||||
out[4*i+3] = x & 0xff;
|
||||
}
|
||||
|
||||
memcpy( hashval, out, state->hashbitlen / 8 );
|
||||
if ( state->hashbitlen % 8 )
|
||||
{
|
||||
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
|
||||
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,64 +0,0 @@
|
||||
#ifndef __NIST_H__
|
||||
#define __NIST_H__
|
||||
|
||||
/*define data alignment for different C compilers*/
|
||||
#if defined(__GNUC__)
|
||||
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
|
||||
#else
|
||||
#define DATA_ALIGN(x) __declspec(align(16)) x
|
||||
#endif
|
||||
|
||||
#include "simd-compat.h"
|
||||
#include "compat/sha3-defs.h"
|
||||
/*
|
||||
* NIST API Specific types.
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
unsigned int hashbitlen;
|
||||
unsigned int blocksize;
|
||||
unsigned int n_feistels;
|
||||
|
||||
#ifdef HAS_64
|
||||
uint64_t count;
|
||||
#else
|
||||
uint32_t count_low;
|
||||
uint32_t count_high;
|
||||
#endif
|
||||
|
||||
DATA_ALIGN(uint32_t A[32]);
|
||||
uint32_t *B;
|
||||
uint32_t *C;
|
||||
uint32_t *D;
|
||||
DATA_ALIGN(unsigned char buffer[128]);
|
||||
|
||||
} hashState_sd;
|
||||
|
||||
/*
|
||||
* NIST API
|
||||
*/
|
||||
|
||||
int init_sd(hashState_sd *state, int hashbitlen);
|
||||
|
||||
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
|
||||
|
||||
int final_sd(hashState_sd *state, BitSequence *hashval);
|
||||
|
||||
int update_final_sd( hashState_sd *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen );
|
||||
|
||||
int simd_full( hashState_sd *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen );
|
||||
|
||||
/*
|
||||
* Internal API
|
||||
*/
|
||||
|
||||
//int SupportedLength(int hashbitlen);
|
||||
int RequiredAlignment(void);
|
||||
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
|
||||
|
||||
void fft128_natural(fft_t *a, unsigned char *x);
|
||||
void fft256_natural(fft_t *a, unsigned char *x);
|
||||
|
||||
#endif
|
@@ -1,198 +0,0 @@
|
||||
#ifndef __SIMD_COMPAT_H__
|
||||
#define __SIMD_COMPAT_H__
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
|
||||
/*
|
||||
* This file defines some helper functions for cross-platform compatibility.
|
||||
*/
|
||||
|
||||
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
|
||||
#define GNU_EXT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* First define some integer types.
|
||||
*/
|
||||
|
||||
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
|
||||
|
||||
/*
|
||||
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
|
||||
* type, if any, or otherwise use a wider type.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include "compat/brg_types.h"
|
||||
|
||||
#define C32(x) ((u32)(x))
|
||||
|
||||
#define HAS_64 1
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* On non-C99 systems, we use "unsigned int" if it is wide enough,
|
||||
* "unsigned long" otherwise. This supports all "reasonable" architectures.
|
||||
* We have to be cautious: pre-C99 preprocessors handle constants
|
||||
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
|
||||
*/
|
||||
|
||||
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
|
||||
|
||||
typedef unsigned int u32;
|
||||
|
||||
#define C32(x) ((u32)(x ## U))
|
||||
|
||||
#else
|
||||
|
||||
typedef unsigned long u32;
|
||||
|
||||
#define C32(x) ((u32)(x ## UL))
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
|
||||
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
|
||||
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
|
||||
* test whether "unsigned long long" is available; we also know that
|
||||
* gcc features this type, even if the libc header do not know it.
|
||||
*/
|
||||
|
||||
#if ((ULONG_MAX >> 31) >> 31) >= 3
|
||||
|
||||
typedef unsigned long u64;
|
||||
|
||||
#define HAS_64 1
|
||||
|
||||
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
|
||||
|
||||
typedef unsigned long long u64;
|
||||
|
||||
#define HAS_64 1
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* No 64-bit type...
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* fft_t should be at least 16 bits wide.
|
||||
* using short int will require less memory, but int is faster...
|
||||
*/
|
||||
|
||||
typedef int fft_t;
|
||||
|
||||
|
||||
/*
|
||||
* Implementation note: some processors have specific opcodes to perform
|
||||
* a rotation. Recent versions of gcc recognize the expression above and
|
||||
* use the relevant opcodes, when appropriate.
|
||||
*/
|
||||
|
||||
#define T32(x) ((x) & C32(0xFFFFFFFF))
|
||||
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* The macro MAYBE_INLINE expands to an inline qualifier, if available.
|
||||
*/
|
||||
|
||||
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
|
||||
#define MAYBE_INLINE static inline
|
||||
#elif defined _MSC_VER
|
||||
#define MAYBE_INLINE __inline
|
||||
#else
|
||||
#define MAYBE_INLINE
|
||||
#endif
|
||||
|
||||
|
||||
/* */
|
||||
|
||||
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
|
||||
|
||||
#define rdtsc() \
|
||||
({ \
|
||||
u32 lo, hi; \
|
||||
__asm__ __volatile__ ( /* serialize */ \
|
||||
"xorl %%eax,%%eax \n cpuid" \
|
||||
::: "%rax", "%rbx", "%rcx", "%rdx"); \
|
||||
/* We cannot use "=A", since this would use %rax on x86_64 */ \
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
|
||||
(u64)hi << 32 | lo; \
|
||||
}) \
|
||||
|
||||
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
|
||||
|
||||
#define rdtsc __rdtsc
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
|
||||
* n-bit boundary.
|
||||
* It is defined as false on unknown architectures.
|
||||
*/
|
||||
|
||||
|
||||
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
|
||||
|
||||
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
|
||||
/*
|
||||
* Unaligned 32-bit access are not expensive on x86 so we don't care
|
||||
*/
|
||||
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
|
||||
|
||||
#elif defined __sparcv9 || defined __sparc || defined __arm || \
|
||||
defined __ia64 || defined __ia64__ || \
|
||||
defined __itanium__ || defined __M_IA64 || \
|
||||
defined __powerpc__ || defined __powerpc
|
||||
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
|
||||
|
||||
#else
|
||||
/*
|
||||
* Unknown architecture: play safe
|
||||
*/
|
||||
#define IS_ALIGNED(p,n) 0
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* checks for endianness */
|
||||
|
||||
#if defined (__linux__) || defined (__GLIBC__)
|
||||
# include <endian.h>
|
||||
#elif defined (__FreeBSD__)
|
||||
# include <machine/endian.h>
|
||||
#elif defined (__OpenBSD__)
|
||||
# include <sys/endian.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BYTE_ORDER
|
||||
|
||||
# if __BYTE_ORDER == __LITTLE_ENDIAN
|
||||
# define SIMD_LITTLE_ENDIAN
|
||||
# elif __BYTE_ORDER == __BIG_ENDIAN
|
||||
# define SIMD_BIG_ENDIAN
|
||||
# endif
|
||||
|
||||
#else
|
||||
|
||||
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
|
||||
# define SIMD_LITTLE_ENDIAN
|
||||
# endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
@@ -231,7 +231,7 @@ static void FFT64( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.

static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};

#define BUTTERFLY_0( i,j ) \
do { \
@@ -240,25 +240,25 @@ do { \
X(i) = v128_sub16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t v = X(j); \
X(j) = v128_add16( X(i), X(j) ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
} while(0)

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE( 2 );
DO_REDUCE( 3 );

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 1 );

@@ -329,10 +329,10 @@ do { \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t u = X(j); \
X(i) = v128_sl16( X(i), w[n] ); \
X(i) = v128_sl16( X(i), w_n ); \
X(j) = v128_sub16( X(j), X(i) ); \
X(i) = v128_add16( u, X(i) ); \
} while(0)
@@ -353,15 +353,15 @@ do { \

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 3 );

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.

static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;

@@ -864,25 +864,25 @@ do { \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
} while(0)

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE( 2 );
DO_REDUCE( 3 );

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 1 );

@@ -953,10 +953,10 @@ do { \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(i) = _mm256_slli_epi16( X(i), w_n ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
@@ -977,15 +977,15 @@ do { \

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 3 );

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -1709,11 +1709,11 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
} while(0)

BUTTERFLY_0( 0, 4 );
@@ -1792,10 +1792,10 @@ do { \
} while(0)

#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(i) = _mm512_slli_epi16( X(i), w_n ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)

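A note on why plain shifts can serve as the twiddle multiplications in these
butterflies: the transform works modulo 257, where 2^8 = -1 and hence
2^16 = 1, so 4 = 2^2 is the 8th root of unity the file's own comments refer
to ("FFT_8 using w=4 as 8th root of unity"). Multiplying by 4^n is a left
shift by 2n bits followed by a REDUCE step, which is exactly what the shift
by w_n computes; the renamed parameter just makes the call sites pass the
shift amount 2n directly instead of an index into w[].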
@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1

#include "simd-compat.h"
#include "simd-utils.h"

#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context

// databitlen is bits

@@ -1,948 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "nist.h"
|
||||
#include "vector.h"
|
||||
|
||||
|
||||
//#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#define PRINT_SOME 0
|
||||
|
||||
/*
|
||||
int SupportedLength(int hashbitlen) {
|
||||
if (hashbitlen <= 0 || hashbitlen > 512)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
int RequiredAlignment(void) {
|
||||
return 16;
|
||||
}
|
||||
|
||||
static const union cv V128 = CV(128);
|
||||
static const union cv V255 = CV(255);
|
||||
static const union cv V257 = CV(257);
|
||||
static const union cv8 V0 = CV(0);
|
||||
|
||||
|
||||
/*
|
||||
* Reduce modulo 257; result is in [-127; 383]
|
||||
* REDUCE(x) := (x&255) - (x>>8)
|
||||
*/
|
||||
#define REDUCE(x) \
|
||||
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
|
||||
|
||||
/*
|
||||
* Reduce from [-127; 383] to [-128; 128]
|
||||
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
|
||||
*/
|
||||
#define EXTRA_REDUCE_S(x) \
|
||||
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
|
||||
|
||||
/*
|
||||
* Reduce modulo 257; result is in [-128; 128]
|
||||
*/
|
||||
#define REDUCE_FULL_S(x) \
|
||||
EXTRA_REDUCE_S(REDUCE(x))
|
||||
|
||||
#define DO_REDUCE(i) \
|
||||
X(i) = REDUCE(X(i))
|
||||
|
||||
#define DO_REDUCE_FULL_S(i) \
|
||||
do { \
|
||||
X(i) = REDUCE(X(i)); \
|
||||
X(i) = EXTRA_REDUCE_S(X(i)); \
|
||||
} while(0)
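The identity behind REDUCE is that 256 = -1 (mod 257), so
x = 256*(x>>8) + (x&255) is congruent to (x&255) - (x>>8). A scalar check of
both reduction steps, written independently of the vector code above and
using the [-127, 383] range the file's comments give for REDUCE's output:

#include <assert.h>
#include <stdint.h>

// x mod 257, normalized to [0, 256].
static int mod257( long x ) { return (int)(((x % 257) + 257) % 257); }

// REDUCE: congruent to x mod 257 because 256 = -1 (mod 257).
static int reduce257( uint16_t x ) { return (int)(x & 255) - (int)(x >> 8); }

// EXTRA_REDUCE_S: fold [-127, 383] into [-128, 128].
static int extra_reduce_s( int x ) { return x <= 128 ? x : x - 257; }

int main( void )
{
   for ( long x = 0; x < 65536; x++ )
      assert( mod257( reduce257( (uint16_t)x ) ) == mod257( x ) );

   for ( int v = -127; v <= 383; v++ )
   {
      int r = extra_reduce_s( v );
      assert( r >= -128 && r <= 128 && mod257( r ) == mod257( v ) );
   }
   return 0;
}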
|
||||
|
||||
#define MAYBE_VOLATILE
|
||||
|
||||
MAYBE_INLINE void fft64(void *a) {
|
||||
|
||||
v16* const A = a;
|
||||
|
||||
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
|
||||
/*
|
||||
#if V16_SIZE == 8
|
||||
#define X(i) A[i]
|
||||
#elif V16_SIZE == 4
|
||||
#define X(i) A[2*i]
|
||||
#endif
|
||||
*/
|
||||
#define X(i) X##i
|
||||
|
||||
X0 = A[0];
|
||||
X1 = A[1];
|
||||
X2 = A[2];
|
||||
X3 = A[3];
|
||||
X4 = A[4];
|
||||
X5 = A[5];
|
||||
X6 = A[6];
|
||||
X7 = A[7];
|
||||
|
||||
#define DO_REDUCE(i) \
|
||||
X(i) = REDUCE(X(i))
|
||||
|
||||
/*
|
||||
* Begin with 8 parallel DIF FFT_8
|
||||
*
|
||||
* FFT_8 using w=4 as 8th root of unity
|
||||
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||
* Output data is in revbin_permuted order.
|
||||
*/
|
||||
|
||||
static const int w[] = {0, 2, 4, 6};
|
||||
// v16 *Twiddle = (v16*)FFT64_Twiddle;
|
||||
|
||||
#define BUTTERFLY(i,j,n) \
|
||||
do { \
|
||||
MAYBE_VOLATILE v16 v = X(j); \
|
||||
X(j) = v16_add(X(i), X(j)); \
|
||||
if (n) \
|
||||
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
|
||||
else \
|
||||
X(i) = v16_sub(X(i), v); \
|
||||
} while(0)
|
||||
|
||||
BUTTERFLY(0, 4, 0);
|
||||
BUTTERFLY(1, 5, 1);
|
||||
BUTTERFLY(2, 6, 2);
|
||||
BUTTERFLY(3, 7, 3);
|
||||
|
||||
DO_REDUCE(2);
|
||||
DO_REDUCE(3);
|
||||
|
||||
BUTTERFLY(0, 2, 0);
|
||||
BUTTERFLY(4, 6, 0);
|
||||
BUTTERFLY(1, 3, 2);
|
||||
BUTTERFLY(5, 7, 2);
|
||||
|
||||
DO_REDUCE(1);
|
||||
|
||||
BUTTERFLY(0, 1, 0);
|
||||
BUTTERFLY(2, 3, 0);
|
||||
BUTTERFLY(4, 5, 0);
|
||||
BUTTERFLY(6, 7, 0);
|
||||
|
||||
/* We don't need to reduce X(7) */
|
||||
DO_REDUCE_FULL_S(0);
|
||||
DO_REDUCE_FULL_S(1);
|
||||
DO_REDUCE_FULL_S(2);
|
||||
DO_REDUCE_FULL_S(3);
|
||||
DO_REDUCE_FULL_S(4);
|
||||
DO_REDUCE_FULL_S(5);
|
||||
DO_REDUCE_FULL_S(6);
|
||||
|
||||
#undef BUTTERFLY
|
||||
|
||||
/*
|
||||
* Multiply by twiddle factors
|
||||
*/
|
||||
|
||||
X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
|
||||
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
|
||||
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
|
||||
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
|
||||
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
|
||||
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
|
||||
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
|
||||
|
||||
/*
|
||||
* Transpose the FFT state with a revbin order permutation
|
||||
* on the rows and the column.
|
||||
* This will make the full FFT_64 in order.
|
||||
*/
|
||||
|
||||
#define INTERLEAVE(i,j) \
|
||||
do { \
|
||||
v16 t1= X(i); \
|
||||
v16 t2= X(j); \
|
||||
X(i) = v16_interleavel(t1, t2); \
|
||||
X(j) = v16_interleaveh(t1, t2); \
|
||||
} while(0)
|
||||
|
||||
INTERLEAVE(1, 0);
|
||||
INTERLEAVE(3, 2);
|
||||
INTERLEAVE(5, 4);
|
||||
INTERLEAVE(7, 6);
|
||||
|
||||
INTERLEAVE(2, 0);
|
||||
INTERLEAVE(3, 1);
|
||||
INTERLEAVE(6, 4);
|
||||
INTERLEAVE(7, 5);
|
||||
|
||||
INTERLEAVE(4, 0);
|
||||
INTERLEAVE(5, 1);
|
||||
INTERLEAVE(6, 2);
|
||||
INTERLEAVE(7, 3);
|
||||
|
||||
#undef INTERLEAVE
|
||||
|
||||
/*
|
||||
* Finish with 8 parallel DIT FFT_8
|
||||
*
|
||||
* FFT_8 using w=4 as 8th root of unity
|
||||
* Unrolled decimation in time (DIT) radix-2 NTT.
|
||||
* Input data is in revbin_permuted order.
|
||||
*/
|
||||
|
||||
#define BUTTERFLY(i,j,n) \
|
||||
do { \
|
||||
MAYBE_VOLATILE v16 u = X(j); \
|
||||
if (n) \
|
||||
X(i) = v16_shift_l(X(i), w[n]); \
|
||||
X(j) = v16_sub(X(j), X(i)); \
|
||||
X(i) = v16_add(u, X(i)); \
|
||||
} while(0)
|
||||
|
||||
DO_REDUCE(0);
|
||||
DO_REDUCE(1);
|
||||
DO_REDUCE(2);
|
||||
DO_REDUCE(3);
|
||||
DO_REDUCE(4);
|
||||
DO_REDUCE(5);
|
||||
DO_REDUCE(6);
|
||||
DO_REDUCE(7);
|
||||
|
||||
BUTTERFLY(0, 1, 0);
|
||||
BUTTERFLY(2, 3, 0);
|
||||
BUTTERFLY(4, 5, 0);
|
||||
BUTTERFLY(6, 7, 0);
|
||||
|
||||
BUTTERFLY(0, 2, 0);
|
||||
BUTTERFLY(4, 6, 0);
|
||||
BUTTERFLY(1, 3, 2);
|
||||
BUTTERFLY(5, 7, 2);
|
||||
|
||||
DO_REDUCE(3);
|
||||
|
||||
BUTTERFLY(0, 4, 0);
|
||||
BUTTERFLY(1, 5, 1);
|
||||
BUTTERFLY(2, 6, 2);
|
||||
BUTTERFLY(3, 7, 3);
|
||||
|
||||
DO_REDUCE_FULL_S(0);
|
||||
DO_REDUCE_FULL_S(1);
|
||||
DO_REDUCE_FULL_S(2);
|
||||
DO_REDUCE_FULL_S(3);
|
||||
DO_REDUCE_FULL_S(4);
|
||||
DO_REDUCE_FULL_S(5);
|
||||
DO_REDUCE_FULL_S(6);
|
||||
DO_REDUCE_FULL_S(7);
|
||||
|
||||
#undef BUTTERFLY
|
||||
|
||||
A[0] = X0;
|
||||
A[1] = X1;
|
||||
A[2] = X2;
|
||||
A[3] = X3;
|
||||
A[4] = X4;
|
||||
A[5] = X5;
|
||||
A[6] = X6;
|
||||
A[7] = X7;
|
||||
|
||||
#undef X
|
||||
|
||||
}
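For reference, here is a scalar sketch of the size-8 DIF pass that the unrolled
BUTTERFLY calls above implement. This is an illustration only (fft8_dif and powm
are hypothetical names, not part of the miner); it assumes arithmetic mod 257,
where w = 4 is a primitive 8th root of unity (4^4 = 256 = -1 mod 257), so the
v16_shift_l by w[n] = 2n bits in the vector code is exactly a multiply by 4^n.

/* Hedged scalar model of the vectorized DIF FFT_8 (mod 257). */
#define P 257

static int powm(int b, int e)            /* b^e mod P, tiny exponents only */
{
    int r = 1;
    while (e--) r = (r * b) % P;
    return r;
}

static void fft8_dif(int x[8])
{
    const int w = 4;                     /* primitive 8th root of unity */
    for (int h = 4; h >= 1; h >>= 1) {   /* butterfly half-width: 4, 2, 1 */
        int stride = 8 / (2 * h);        /* twiddle exponent stride */
        for (int k = 0; k < 8; k += 2 * h)
            for (int j = 0; j < h; j++) {
                int u = x[k + j], v = x[k + j + h];
                x[k + j]     = (u + v) % P;
                x[k + j + h] = ((u - v + P) * powm(w, j * stride)) % P;
            }
    }
    /* like the vector version, the output is in revbin_permuted order */
}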

MAYBE_INLINE void fft128(void *a) {

  int i;

  // Temp space to help with the interleaving at the end
  v16 B[8];

  v16 *A = (v16*) a;
//  v16 *Twiddle = (v16*)FFT128_Twiddle;

  /* Size-2 butterflies */
  for (i = 0; i<8; i++) {
    B[i]   = v16_add(A[i], A[i+8]);
    B[i]   = REDUCE_FULL_S(B[i]);
    A[i+8] = v16_sub(A[i], A[i+8]);
    A[i+8] = REDUCE_FULL_S(A[i+8]);
    A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
    A[i+8] = REDUCE_FULL_S(A[i+8]);
  }

  fft64(B);
  fft64(A+8);

  /* Transpose (i.e. interleave) */
  for (i=0; i<8; i++) {
    A[2*i]   = v16_interleavel (B[i], A[i+8]);
    A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
  }
}
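In formulas, the size-2 butterfly loop above is the standard radix-2
decimation-in-frequency split of a 128-point transform into two 64-point ones.
With \omega a primitive 128th root of unity mod 257 (139 \equiv -118, whose
powers fill the first row of FFT128_Twiddle below):

F[2k]   = \sum_{j=0}^{63} (a_j + a_{j+64}) \, \omega^{2jk}            \pmod{257}
F[2k+1] = \sum_{j=0}^{63} (a_j - a_{j+64}) \, \omega^{j} \, \omega^{2jk} \pmod{257}

B[] feeds the first sum, the twiddled A[i+8] feeds the second, and the final
interleave merges the even and odd bins back into natural order.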

#ifdef v16_broadcast
/* Compute the FFT using a table
 * The function works if the value of the message is smaller
 * than 2^14.
 */
void fft128_msg_final(short *a, const unsigned char *x) {

  static const union cv FFT128_Final_Table[] = {
    {{   1, -211,   60,  -67,    2,   92, -137,  123}},
    {{   2,  118,   45,  111,   97,  -46,   49, -106}},
    {{   4,  -73,  -17,  -11,    8,  111,  -34,  -22}},
    {{ -68,   -4,   76,  -25,   96,  -96,  -68,   -9}},
    {{  16,  -35,  -68,  -44,   32,  -70, -136,  -88}},
    {{   0, -124,   17,   12,   -6,   57,   47,   -8}},
    {{  64,  117,  -15,   81,  128,  -23,  -30,  -95}},
    {{ -68,  -53,  -52,  -70,  -10, -117,   77,   21}},
    {{  -1,  -46,  -60,   67,   -2,  -92, -120, -123}},
    {{  -2, -118,  -45, -111,  -97,   46,  -49,  106}},
    {{  -4,   73,   17,   11,   -8, -111,   34,   22}},
    {{  68,    4,  -76,   25,  -96,   96,   68,    9}},
    {{ -16, -222,   68,   44,  -32,   70, -121,   88}},
    {{   0,  124,  -17,  -12,    6,  -57,  -47,    8}},
    {{ -64, -117,   15,  -81, -128, -234,   30,   95}},
    {{  68,   53,   52,   70,   10,  117,  -77,  -21}},
    {{-118,  -31,  116,  -61,   21,  -62,  -25, -122}},
    {{-101,  107,  -45,  -95,   -8,    3,  101,  -34}},
    {{  42, -124,  -50,   13,   84,    9, -100, -231}},
    {{ -79,  -53,   82,   65,  -81,   47,   61,  107}},
    {{ -89, -239,   57, -205, -178,   36, -143,  104}},
    {{-126,  113,   33,  111,  103, -109,   65, -114}},
    {{ -99,   72,  -29,  -49, -198, -113,  -58,  -98}},
    {{   8,  -27, -106,  -30,  111,    6,   10, -108}},
    {{-139,   31, -116, -196,  -21,   62,   25, -135}},
    {{ 101, -107,   45,   95,    8,   -3, -101,   34}},
    {{ -42, -133,   50,  -13,  -84,   -9,  100,  -26}},
    {{  79,   53,  -82,  -65,   81,  -47,  -61, -107}},
    {{-168,  -18,  -57,  -52,  -79,  -36, -114, -104}},
    {{ 126, -113,  -33, -111, -103,  109,  -65,  114}},
    {{  99,  -72, -228,   49,  -59,  113,   58, -159}},
    {{  -8,   27,  106,   30, -111,   -6,  -10,  108}}
  };

//  v16 *Table = (v16*)FFT128_Final_Table;
  v16 *A = (v16*) a;
  v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
  v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
//  v16 msg2 = v16_broadcast(x[1]);

#if 0
  int i;
  for (i=0; i<16; i++) {
    v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16 , msg2);
    v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
    sum = v16_add(sum, tmp);
    A[i] = REDUCE_FULL_S(sum);
  }

#else

#define FFT_FINAL(i) \
  v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \
  v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
  sum##i = v16_add(sum##i, tmp##i); \
  A[i] = REDUCE_FULL_S(sum##i);

  FFT_FINAL(0)
  FFT_FINAL(1)
  FFT_FINAL(2)
  FFT_FINAL(3)
  FFT_FINAL(4)
  FFT_FINAL(5)
  FFT_FINAL(6)
  FFT_FINAL(7)
  FFT_FINAL(8)
  FFT_FINAL(9)
  FFT_FINAL(10)
  FFT_FINAL(11)
  FFT_FINAL(12)
  FFT_FINAL(13)
  FFT_FINAL(14)
  FFT_FINAL(15)

#endif

}
#endif
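A hedged reading of the table trick above: the final SIMD block has only two
variable bytes, so with m0 = x[0] and m1 = x[1] mapped to signed residues in
[-128, 128] (the x > 128 ? x - 257 : x adjustment), each output lane k reduces
to

A[k] = m0 + m1 * w^k + T(w^k)  (mod 257)

where FFT128_Final_Table[2i] holds the w^k values, FFT128_Final_Table[2i+1]
holds the precomputed transform T(w^k) of the fixed final-block padding, and
the small signed inputs keep every intermediate below the 2^14 bound the
comment mentions.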

void fft128_msg(short *a, const unsigned char *x, int final) {

  static const union cv Tweak =
    {{0,0,0,0,0,0,0,1}};
  static const union cv FinalTweak =
    {{0,0,0,0,0,1,0,1}};

  v8 *X  = (v8*) x;
  v16 *A = (v16*) a;
//  v16 *Twiddle = (v16*)FFT128_Twiddle;

#define UNPACK(i) \
do { \
  v8 t = X[i]; \
  A[2*i]   = v8_mergel(t, V0.v8); \
  A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
  A[2*i+8] = REDUCE(A[2*i+8]); \
  A[2*i+1] = v8_mergeh(t, V0.v8); \
  A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
  A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)

/*
 * This allows tweaking the last butterflies to introduce X^127
 */
#define UNPACK_TWEAK(i,tw) \
do { \
  v8 t = X[i]; \
  v16 tmp; \
  A[2*i]   = v8_mergel(t, V0.v8); \
  A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
  A[2*i+8] = REDUCE(A[2*i+8]); \
  tmp      = v8_mergeh(t, V0.v8); \
  A[2*i+1] = v16_add(tmp, tw); \
  A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
  A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)

  UNPACK(0);
  UNPACK(1);
  UNPACK(2);
  if (final)
    UNPACK_TWEAK(3, FinalTweak.v16);
  else
    UNPACK_TWEAK(3, Tweak.v16);

#undef UNPACK
#undef UNPACK_TWEAK

  fft64(a);
  fft64(a+64);
}

#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {

  for (int i=0; i<64; i++)
    a[i] = x[i];

  for (int i=64; i<128; i++)
    a[i] = 0;

  a[127] = 1;
  a[125] = final? 1: 0;

  fft128(a);
}
#endif

void fft256_msg(short *a, const unsigned char *x, int final) {

  static const union cv Tweak =
    {{0,0,0,0,0,0,0,1}};
  static const union cv FinalTweak =
    {{0,0,0,0,0,1,0,1}};

  v8 *X  = (v8*) x;
  v16 *A = (v16*) a;
//  v16 *Twiddle = (v16*)FFT256_Twiddle;

#define UNPACK(i) \
do { \
  v8 t = X[i]; \
  A[2*i]    = v8_mergel(t, V0.v8); \
  A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
  A[2*i+16] = REDUCE(A[2*i+16]); \
  A[2*i+1]  = v8_mergeh(t, V0.v8); \
  A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
  A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)

/*
 * This allows tweaking the last butterflies to introduce X^255
 */
#define UNPACK_TWEAK(i,tw) \
do { \
  v8 t = X[i]; \
  v16 tmp; \
  A[2*i]    = v8_mergel(t, V0.v8); \
  A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
  A[2*i+16] = REDUCE(A[2*i+16]); \
  tmp       = v8_mergeh(t, V0.v8); \
  A[2*i+1]  = v16_add(tmp, tw); \
  A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
  A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)

  UNPACK(0);
  UNPACK(1);
  UNPACK(2);
  UNPACK(3);
  UNPACK(4);
  UNPACK(5);
  UNPACK(6);
  if (final)
    UNPACK_TWEAK(7, FinalTweak.v16);
  else
    UNPACK_TWEAK(7, Tweak.v16);

#undef UNPACK
#undef UNPACK_TWEAK

  fft128(a);
  fft128(a+128);
}
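Why the tweak works: the second half of the input block is zero padding, so
folding a value tw into the first butterfly,

B_j = a_j + tw_j,   C_j = (a_j - tw_j) \cdot \omega^j,

is identical to transforming an input with a_{j+128} = tw_j. Tweak therefore
sets the X^255 coefficient to 1 and FinalTweak additionally sets X^253; in the
128-point fft128_msg above the same vectors land on X^127 and X^125, matching
the a[127] = 1; a[125] = final reference code kept under #if 0.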

void rounds(u32* state, const unsigned char* msg, short* fft) {

  v32* S = (v32*) state;
  const v32* M = (v32*)msg;
  volatile v16* W = (v16*)fft;

  register v32 S0, S1, S2, S3;
  static const union cv code[] = { CV(185), CV(233) };

  S0 = v32_xor(S[0], v32_bswap(M[0]));
  S1 = v32_xor(S[1], v32_bswap(M[1]));
  S2 = v32_xor(S[2], v32_bswap(M[2]));
  S3 = v32_xor(S[3], v32_bswap(M[3]));

#define S(i) S##i

/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */

#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))

#define F(a,b,c,fun) F_##fun (a,b,c)

/*
 * We split the round function in two halves
 * so as to insert some independent computations in between.
 */

#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2

#define STEP_1(a,b,c,d,w,fun,r,s,z) \
do { \
  if (PRINT_SOME) { \
    int j; \
    v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
    u32 *WW = (void*)&ww; \
    u32 *AA = (void*)&aa; \
    u32 *BB = (void*)&bb; \
    u32 *CC = (void*)&cc; \
    u32 *DD = (void*)&dd; \
    for (j=0; j<4; j++) { \
      printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
              WW[j], r, s, SUM3_##z, \
              AA[j], BB[j], CC[j], DD[j]); \
    } \
  } \
  TT = F(a,b,c,fun); \
  a = v32_rotate(a,r); \
  w = v32_add(w, d); \
  TT = v32_add(TT, w); \
  TT = v32_rotate(TT,s); \
  d = v32_shufxor(a,SUM3_##z); \
} while(0)

#define STEP_2(a,b,c,d,w,fun,r,s) \
do { \
  d = v32_add(d, TT); \
} while(0)

#define STEP(a,b,c,d,w,fun,r,s,z) \
do { \
  register v32 TT; \
  STEP_1(a,b,c,d,w,fun,r,s,z); \
  STEP_2(a,b,c,d,w,fun,r,s); \
} while(0);

#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
              fun,r,s,t,u,z,r0) \
do { \
  register v32 W0, W1, W2, W3, TT; \
  W0 = v16_merge##u0(W[h0], W[l0]); \
  W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
  STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
  W1 = v16_merge##u1(W[h1], W[l1]); \
  W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
  STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
  STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
  W2 = v16_merge##u2(W[h2], W[l2]); \
  W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
  STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
  STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
  W3 = v16_merge##u3(W[h3], W[l3]); \
  W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
  STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
  STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
  STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)

/*
 * 4 rounds with code 185
 */
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0, 0);
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0, 1);
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0, 2);
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0, 0);

/*
 * 4 rounds with code 233
 */
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1, 1);
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1, 2);
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1, 0);
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1, 1);

/*
 * 1 round as feed-forward
 */
  STEP(S(0), S(1), S(2), S(3), S[0], 0,  4, 13, 20);
  STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
  STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
  STEP(S(1), S(2), S(3), S(0), S[3], 0, 25,  4, 20);

  S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
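Ignoring the 4-way SSE parallelism and the v32_shufxor lane permutation, one
STEP_1/STEP_2 pair computes the following scalar update (a hedged sketch with
illustrative names; f is IF for F_0 and MAJ for F_1):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* The new word lands in D's slot and the register roles rotate, which is why
   the ROUND macro passes S(0..3) to successive steps in rotated order. */
static void simd_step(uint32_t S[4], uint32_t w, int r, int s,
                      uint32_t (*f)(uint32_t, uint32_t, uint32_t))
{
    uint32_t tt = rotl32(f(S[0], S[1], S[2]) + w + S[3], s) + rotl32(S[0], r);
    S[3] = S[2];
    S[2] = S[1];
    S[1] = rotl32(S[0], r);
    S[0] = tt;
}

Splitting this into STEP_1 and STEP_2 is what lets the ROUND macro schedule
the independent message-expansion multiplies by code[z] (185 or 233) between
the two halves, exactly as the comment above describes.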

void rounds512(u32* state, const unsigned char* msg, short* fft) {

  v32* S = (v32*) state;
  v32* M = (v32*) msg;
  v16* W = (v16*) fft;

  register v32 S0l, S1l, S2l, S3l;
  register v32 S0h, S1h, S2h, S3h;
  static const union cv code[] = { CV(185), CV(233) };

  S0l = v32_xor(S[0], v32_bswap(M[0]));
  S0h = v32_xor(S[1], v32_bswap(M[1]));
  S1l = v32_xor(S[2], v32_bswap(M[2]));
  S1h = v32_xor(S[3], v32_bswap(M[3]));
  S2l = v32_xor(S[4], v32_bswap(M[4]));
  S2h = v32_xor(S[5], v32_bswap(M[5]));
  S3l = v32_xor(S[6], v32_bswap(M[6]));
  S3h = v32_xor(S[7], v32_bswap(M[7]));

#define S(i) S##i

/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */

#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

/*
 * We split the round function in two halves
 * so as to insert some independent computations in between.
 */

#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

#define PERM_0(d,a) /* XOR 1 */ \
do { \
  d##l = v32_shufxor(a##l,1); \
  d##h = v32_shufxor(a##h,1); \
} while(0)

#define PERM_1(d,a) /* XOR 6 */ \
do { \
  d##l = v32_shufxor(a##h,2); \
  d##h = v32_shufxor(a##l,2); \
} while(0)

#define PERM_2(d,a) /* XOR 2 */ \
do { \
  d##l = v32_shufxor(a##l,2); \
  d##h = v32_shufxor(a##h,2); \
} while(0)

#define PERM_3(d,a) /* XOR 3 */ \
do { \
  d##l = v32_shufxor(a##l,3); \
  d##h = v32_shufxor(a##h,3); \
} while(0)

#define PERM_4(d,a) /* XOR 5 */ \
do { \
  d##l = v32_shufxor(a##h,1); \
  d##h = v32_shufxor(a##l,1); \
} while(0)

#define PERM_5(d,a) /* XOR 7 */ \
do { \
  d##l = v32_shufxor(a##h,3); \
  d##h = v32_shufxor(a##l,3); \
} while(0)

#define PERM_6(d,a) /* XOR 4 */ \
do { \
  d##l = a##h; \
  d##h = a##l; \
} while(0)

#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
  if (PRINT_SOME) { \
    int j; \
    v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
    u32 *WW = (void*)&ww; \
    u32 *AA = (void*)&aa; \
    u32 *BB = (void*)&bb; \
    u32 *CC = (void*)&cc; \
    u32 *DD = (void*)&dd; \
    for (j=0; j<4; j++) { \
      printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
              WW[j], r, s, \
              AA[j], BB[j], CC[j], DD[j]); \
    } \
  } \
  TTl = Fl(a,b,c,fun); \
  TTh = Fh(a,b,c,fun); \
  a##l = v32_rotate(a##l,r); \
  a##h = v32_rotate(a##h,r); \
  w##l = v32_add(w##l, d##l); \
  w##h = v32_add(w##h, d##h); \
  TTl = v32_add(TTl, w##l); \
  TTh = v32_add(TTh, w##h); \
  TTl = v32_rotate(TTl,s); \
  TTh = v32_rotate(TTh,s); \
  PERM(z,d,a); \
} while(0)

#define STEP_1(a,b,c,d,w,fun,r,s,z) \
  STEP_1_(a,b,c,d,w,fun,r,s,z)

#define STEP_2_(a,b,c,d,w,fun,r,s) \
do { \
  d##l = v32_add(d##l, TTl); \
  d##h = v32_add(d##h, TTh); \
} while(0)

#define STEP_2(a,b,c,d,w,fun,r,s) \
  STEP_2_(a,b,c,d,w,fun,r,s)

#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
do { \
  register v32 TTl, TTh, Wl=w1, Wh=w2; \
  STEP_1(a,b,c,d,W,fun,r,s,z); \
  STEP_2(a,b,c,d,W,fun,r,s); \
} while(0);

#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)

#define MSG(w,hh,ll,u,z) \
do { \
  int a = MSG_##u(hh); \
  int b = MSG_##u(ll); \
  w##l = v16_mergel(W[a], W[b]); \
  w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
  w##h = v16_mergeh(W[a], W[b]); \
  w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
} while(0)

#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
              fun,r,s,t,u,z) \
do { \
  register v32 W0l, W1l, W2l, W3l, TTl; \
  register v32 W0h, W1h, W2h, W3h, TTh; \
  MSG(W0,h0,l0,u0,z); \
  STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
  MSG(W1,h1,l1,u1,z); \
  STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
  STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
  MSG(W2,h2,l2,u2,z); \
  STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
  STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
  MSG(W3,h3,l3,u3,z); \
  STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
  STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
  STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)

/*
 * 4 rounds with code 185
 */
#define PERM_START 0
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0);
#undef PERM_START
#define PERM_START 5
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0);
#undef PERM_START

/*
 * 4 rounds with code 233
 */
#define PERM_START 2
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 6
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 3
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
#undef PERM_START

/*
 * 1 round as feed-forward
 */
#define PERM_START 4
  STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0);
  STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
  STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
  STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3);
#undef PERM_START

  S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
  S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
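The SUM3_*/SUM7_* ladders in both round functions are modular-addition tables
written out so the preprocessor can paste them as literal tokens. A sketch of
what they encode (sum3/sum7 are illustrative names, not in the source):

/* SUM7_zp == (z + p) % 7: selects which PERM_* lane swap step z uses when a
   round is compiled with PERM_START == p. SUM3_zp == ((z + p) % 3) + 1 plays
   the same role as the v32_shufxor immediate in the 256-bit rounds() above. */
static int sum7(int z, int p) { return (z + p) % 7; }
static int sum3(int z, int p) { return ((z + p) % 3) + 1; }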

void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
  if (state->hashbitlen <= 256) {
    union cv Y[16];
    short* y = (short*) Y[0].u16;

#ifdef v16_broadcast
    if (final == 2) {
      fft128_msg_final(y, m);
      rounds(state->A, m, y);
    } else {
      fft128_msg(y, m, final);
      rounds(state->A, m, y);
    }
#else
    fft128_msg(y, m, final);
    rounds(state->A, m, y);
#endif
  } else {
    union cv Y[32];
    short* y = (short*) Y[0].u16;

    fft256_msg(y, m, final);
    rounds512(state->A, m, y);
  }
}

/*
 * Give the FFT output in the regular order for consistency checks
 */
void fft128_natural(fft_t *x, unsigned char *a) {
  union cv Y[16];
  short* y = (short*) Y[0].u16;
  int i;

  fft128_msg(y, a, 0);

  for(i=0; i<64; i++) {
    x[2*i]   = y[i];
    x[2*i+1] = y[i+64];
  }
}

#endif // SSE2

@@ -1,246 +0,0 @@
#ifndef __VECTOR_H__
#define __VECTOR_H__

#include "compat.h"
#include "simd-utils.h"

/*******************************
 * Using GCC vector extensions *
 *******************************/

//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));

typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));

typedef v16qi v8;
typedef v8hi v16;
typedef v4si v32;
#define V16_SIZE 8

union cv {
  unsigned short u16[8];
  v16 v16;
};

union cv8 {
  unsigned char u8[16];
  v8 v8;
};

union u32 {
  u32 u[4];
  v32 v;
};

#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x)  ( (v8) (x))
#define V816(x)  ((v16) (x))

#if 0
/* These instructions are shorter than the PAND/POR/... that GCC uses */

#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y)  ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps  ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})

#define v16_and(x,y) ((v16) vec_and ((x), (y)))
#define v16_or(x,y)  ((v16) vec_or  ((x), (y)))
#define v16_xor(x,y) ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))

#define v32_and(x,y) ((v32) vec_and ((x), (y)))
#define v32_or(x,y)  ((v32) vec_or  ((x), (y)))
#define v32_xor(x,y) ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif

#if defined(__SSE2__)

#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y)  ((x)|(y))
#define vec_xor(x,y) ((x)^(y))

#define v16_and vec_and
#define v16_or  vec_or
#define v16_xor vec_xor

#define v32_and vec_and
#define v32_or  vec_or
#define v32_xor vec_xor

#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))

#define v32_add(x,y) ((x)+(y))

#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x)   (-(x))
#define v16_shift_l  __builtin_ia32_psllwi128
#define v16_shift_r  __builtin_ia32_psrawi128
#define v16_cmp      __builtin_ia32_pcmpgtw128

#define v16_interleavel   __builtin_ia32_punpcklwd128
#define v16_interleaveh   __builtin_ia32_punpckhwd128

#define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))

#define v8_mergel(a,b)    V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b)    V816(__builtin_ia32_punpckhbw128(a,b))

#define v32_shift_l  __builtin_ia32_pslldi128
#define v32_shift_r  __builtin_ia32_psrldi128

#define v32_rotate(x,n) \
  v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))

#define v32_shuf __builtin_ia32_pshufd

#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)

#define v16_broadcast(x) ({ \
  union u32 u; \
  u32 xx = x; \
  u.u[0] = xx | (xx << 16); \
  V3216(v32_shuf(u.v,0)); })

#define CV(x) {{x, x, x, x, x, x, x, x}}

#elif defined(__aarch64__) && defined(__ARM_NEON)

#define vec_and( x, y )  v128_and( x, y )
#define vec_or(x,y)      v128_or( x, y )
#define vec_xor(x,y)     v128_xor( x, y )

#define v16_and  v128_and
#define v16_or   v128_or
#define v16_xor  v128_xor

#define v32_and  v128_and
#define v32_or   v128_or
#define v32_xor  v128_xor

#define vec_andn( x,y )  v128_andnot( x, y )
#define v16_andn  vec_andn
#define v32_andn  vec_andn

#define v32_add( x, y )  v128_add32( x, y )

#define v16_add( x, y )  v128_add16( x, y )
#define v16_sub( x, y )  v128_sub16( x, y )
#define v16_mul( x, y )  v128_mul16( x, y )
#define v16_neg(x)       v128_negate16( x )
#define v16_shift_l( x, c )  v128_sl16
#define v16_shift_r      v128_sr16
#define v16_cmp          v128_cmpgt16

#define v16_interleavel  v128_unpacklo16
#define v16_interleaveh  v128_unpackhi16

#define v16_mergel(a,b)  V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b)  V1632(__builtin_ia32_punpckhwd128(a,b))

#define v8_mergel(a,b)   V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b)   V816(__builtin_ia32_punpckhbw128(a,b))

#define v32_shift_l  v128_sl32
#define v32_shift_r  v128_sr32

#define v32_rotate(x,n)  v128_rol32

#define v32_shuf __builtin_ia32_pshufd

#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)

#define v16_broadcast(x) ({ \
  union u32 u; \
  u32 xx = x; \
  u.u[0] = xx | (xx << 16); \
  V3216(v32_shuf(u.v,0)); })

#define CV(x) {{x, x, x, x, x, x, x, x}}

#else

#error "I don't know how to vectorize on this architecture."

#endif

/* Twiddle tables */

static const union cv FFT64_Twiddle[] = {
  {{1,   2,   4,   8,  16,  32,  64, 128}},
  {{1,  60,   2, 120,   4, -17,   8, -34}},
  {{1, 120,   8, -68,  64, -30,  -2,  17}},
  {{1,  46,  60, -67,   2,  92, 120, 123}},
  {{1,  92, -17, -22,  32, 117, -30,  67}},
  {{1, -67, 120, -73,   8, -22, -68, -70}},
  {{1, 123, -34, -70, 128,  67,  17,  35}},
};

static const union cv FFT128_Twiddle[] = {
  {{  1, -118,   46,  -31,   60,  116,  -67,  -61}},
  {{  2,   21,   92,  -62,  120,  -25,  123, -122}},
  {{  4,   42,  -73, -124,  -17,  -50,  -11,   13}},
  {{  8,   84,  111,    9,  -34, -100,  -22,   26}},
  {{ 16,  -89,  -35,   18,  -68,   57,  -44,   52}},
  {{ 32,   79,  -70,   36,  121,  114,  -88,  104}},
  {{ 64,  -99,  117,   72,  -15,  -29,   81,  -49}},
  {{128,   59,  -23, -113,  -30,  -58,  -95,  -98}},
};

static const union cv FFT256_Twiddle[] = {
  {{   1,   41, -118,   45,   46,   87,  -31,   14}},
  {{  60, -110,  116, -127,  -67,   80,  -61,   69}},
  {{   2,   82,   21,   90,   92,  -83,  -62,   28}},
  {{ 120,   37,  -25,    3,  123,  -97, -122, -119}},
  {{   4,  -93,   42,  -77,  -73,   91, -124,   56}},
  {{ -17,   74,  -50,    6,  -11,   63,   13,   19}},
  {{   8,   71,   84,  103,  111,  -75,    9,  112}},
  {{ -34, -109, -100,   12,  -22,  126,   26,   38}},
  {{  16, -115,  -89,  -51,  -35,  107,   18,  -33}},
  {{ -68,   39,   57,   24,  -44,   -5,   52,   76}},
  {{  32,   27,   79, -102,  -70,  -43,   36,  -66}},
  {{ 121,   78,  114,   48,  -88,  -10,  104, -105}},
  {{  64,   54,  -99,   53,  117,  -86,   72,  125}},
  {{ -15, -101,  -29,   96,   81,  -20,  -49,   47}},
  {{ 128,  108,   59,  106,  -23,   85, -113,   -7}},
  {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94}}
};

#endif
@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

@@ -43,11 +39,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} c11_ctx_holder;

c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));

@@ -69,11 +61,6 @@ void init_c11_ctx()
init_luffa( &c11_ctx.luffa, 512 );
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &c11_ctx.simd );
#else
init_sd( &c11_ctx.simd, 512 );
#endif
}

void c11_hash( void *output, const void *input )

@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );

update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );

cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );

sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif

memcpy(output, hash, 32);
memcpy(output, hash, 32);
}

int scanhash_c11( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"

static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };

@@ -37,11 +33,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
#ifdef __AES__
hashState_groestl groestl;
#else

@@ -62,11 +54,6 @@ void init_tt10_ctx()
init_luffa( &tt10_ctx.luffa, 512 );
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &tt10_ctx.simd );
#else
init_sd( &tt10_ctx.simd, 512 );
#endif
#ifdef __AES__
init_groestl( &tt10_ctx.groestl, 64 );
#else

@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
}
break;
case 9:
if ( i == 0 )
{
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hashB,
(const BitSequence *)input + midlen, tail*8 );
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
final_sd( &ctx.simd, (BitSequence *)hashB );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
break;
default:
break;

@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
break;
case 9:
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
#if defined(__aarch64__)
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
sph_simd512_close( &tt10_mid.simd, hash);
#else
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
#endif
break;
default:
break;
}
@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"

typedef struct {
sph_blake512_context blake;

@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;

x11_ctx_holder x11_ctx;

@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}

void x11_hash( void *state, const void *input )

@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"

typedef struct {

@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;

@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );

@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__
@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;

@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}

void x11gost_hash(void *output, const void *input)

@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );

#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"

@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;

@@ -68,14 +60,9 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};

@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;

@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};

@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;

@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};

@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;

@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );

@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

//11---echo---
#ifdef __AES__
@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;

@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};

@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;

@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );

@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );

@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"

@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
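All of the scanhash hunks above fix the same C pitfall: a semicolon directly
after an if condition is an empty statement, so the call meant to be guarded
ran unconditionally. A minimal illustration (foo and bar are hypothetical):

if ( foo() );   /* empty statement: foo()'s result is tested, then discarded */
    bar();      /* runs unconditionally */

if ( foo() )    /* fixed: bar() is properly guarded */
    bar();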
@@ -18,11 +18,7 @@
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#else
|
||||
#include "algo/simd/nist.h"
|
||||
#endif
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -53,11 +49,7 @@ typedef struct {
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
|
||||
init_luffa( &sonoa_ctx.luffa, 512 );
|
||||
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
|
||||
sph_shavite512_init( &sonoa_ctx.shavite );
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &sonoa_ctx.simd );
|
||||
#else
|
||||
init_sd( &sonoa_ctx.simd, 512 );
|
||||
#endif
|
||||
sph_hamsi512_init( &sonoa_ctx.hamsi );
|
||||
sph_shabal512_init( &sonoa_ctx.shabal );
|
||||
sph_whirlpool_init( &sonoa_ctx.whirlpool );
|
||||
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
|
||||
sph_shavite512(&ctx.shavite, hash, 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash);
|
||||
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#endif
|
||||
simd512_ctx( &ctx.simd, hash, hash, 64 );
|
||||
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
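
The pattern running through this file, and through the x17 and xevan hunks below, is the same: every per-call #if defined(__aarch64__) split around the SIMD-512 stage is replaced by a single simd512_ctx() call, with the arch choice moved inside the wrapper. A compilable sketch of that shape — the wrapper body here is a placeholder, not the real simd-hash-2way implementation:

#include <stdio.h>
#include <string.h>

typedef struct { unsigned char buf[64]; } simd512_context;

/* The arch #ifdef lives here once, instead of at every call site. */
static void simd512_ctx( simd512_context *ctx, void *out,
                         const void *in, size_t len )
{
#if defined(__aarch64__)
   /* real code would run the sph_simd512 init/update/close sequence */
#else
   /* real code would run the SSE2/AVX SIMD-512 path */
#endif
   memset( ctx->buf, 0, sizeof ctx->buf );       /* placeholder work only */
   memcpy( ctx->buf, in, len < 64 ? len : 64 );
   memcpy( out, ctx->buf, sizeof ctx->buf );
}

int main( void )
{
   simd512_context ctx;
   unsigned char hash[64] = {1};
   simd512_ctx( &ctx, hash, hash, 64 );          /* one-line call site */
   printf( "%d\n", hash[0] );
   return 0;
}
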

@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);

if ( work_restart[thr_id].restart ) return 0;
//

sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);

@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"

@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"

@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;

@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif

//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif

sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)

@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;

@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );

@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );

#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif

#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

@@ -18,7 +18,6 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/fugue/fugue-aesni.h"
#include "algo/whirlpool/sph_whirlpool.h"

arm-build.sh

@@ -1,14 +0,0 @@
#!/bin/bash

# Linux build

make distclean || echo clean

rm -f config.status
./autogen.sh || echo done

CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl

make -j $(nproc)

strip -s cpuminer

@@ -15,51 +15,43 @@ rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-crypto-sha3

make clean || echo clean
CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-crypto

make clean || echo clean
CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9

# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2

# SHA3 available in armv8.4
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.4-crypto-sha3

make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-crypto

make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8

make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.

rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null

# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean

@@ -18,28 +18,55 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes

# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-alderlake
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake

# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake
#mv cpuminer cpuminer-arrowlake-s

# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Graniterapids will not include AVX10, SHA512 or APX,
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids

# Force AVX10-256
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256

# Force SHA512 AVX10-512
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512

# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver5" ./configure --with-curl
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-zen4
#mv cpuminer cpuminer-zen5

# Zen4: AVX512 SHA VAES
make clean || echo clean

@@ -70,7 +97,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512

# AVX2 SHA VAES: generic
# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
make clean || echo done
rm -f config.status
# vaes doesn't include aes

|
@@ -1,10 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Compile on Windows using MSYS2 and MinGW.
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|

build.sh

@@ -1,20 +1,9 @@
#!/bin/bash

#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi

# Linux build
#!/bin/sh

make distclean || echo clean

rm -f config.status
./autogen.sh || echo done

#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl

make -j $(nproc)

strip -s cpuminer
#strip -s cpuminer

@@ -1,8 +1,8 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targeted executables.

rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null

rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

compat.h

@@ -3,7 +3,7 @@

#ifdef WIN32

#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif

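This compat.h change is a version-gate fix: with "==", CPU-group support was only compiled when the build targeted exactly Windows 7 (0x0601), and silently dropped for Windows 8/10/11 targets; ">=" treats 0x0601 as a minimum. A standalone sketch of the semantics (MY_WIN32_WINNT is a hypothetical stand-in so the example compiles anywhere):

#include <stdio.h>

#define MY_WIN32_WINNT 0x0A00            /* e.g. a Windows 10 target */

#if MY_WIN32_WINNT >= 0x0601             /* Windows 7 or later */
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif

int main( void )
{
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
   printf( "CPU groups enabled\n" );     /* now also true for 0x0A00 */
#else
   printf( "CPU groups disabled\n" );
#endif
   return 0;
}
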

configure.ac

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.5])
AC_INIT([cpuminer-opt], [25.2])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

@@ -41,32 +41,30 @@ AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc, le16dec, le16enc], [], [],
AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])

MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac

PTHREAD_FLAGS="-pthread"
WS2_LIBS=""

case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac

AC_ARG_ENABLE([assembly],

@@ -75,54 +73,14 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi

if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi

# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)

# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",[])
fi

LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))

AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
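
The __uint128_t probe above just tries to compile a 128-bit integer declaration. It is roughly this standalone C program (a gcc/clang extension available on 64-bit targets), which is what 64x64->128 multiply support looks like at the source level:

static __uint128_t i = 100;

int main (void)
{
   __uint128_t x = i * i;     /* full 128-bit product, no overflow */
   return (int)(x >> 64);     /* high 64 bits */
}
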

@@ -136,16 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])

if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])

# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],

@@ -158,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi

# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])

if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi

CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"

#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)

# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])

configure~

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.4.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.2.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,

@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.4'
PACKAGE_STRING='cpuminer-opt 24.4'
PACKAGE_VERSION='25.2'
PACKAGE_STRING='cpuminer-opt 25.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -657,14 +657,14 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
ARCH_ARM_TRUE
ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE

@@ -796,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias

@@ -1360,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.4 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.2 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1431,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.2:";;
esac
cat <<\_ACEOF

@@ -1455,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr

Some influential environment variables:
CC C compiler command

@@ -1538,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.4
cpuminer-opt configure 25.2
generated by GNU Autoconf 2.71

Copyright (C) 2021 Free Software Foundation, Inc.

@@ -1985,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 24.4, which was
It was created by cpuminer-opt $as_me 25.2, which was
generated by GNU Autoconf 2.71. Invocation command line was

$ $0$ac_configure_args_raw

@@ -3593,7 +3591,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.4'
VERSION='25.2'


printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h

@@ -5810,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_11+y}
if test ${ac_cv_prog_cxx_cxx11+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_11=no
ac_cv_prog_cxx_cxx11=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

@@ -5856,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_98+y}
if test ${ac_cv_prog_cxx_cxx98+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_98=no
ac_cv_prog_cxx_cxx98=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

@@ -6502,32 +6500,30 @@ then :
fi


MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac

PTHREAD_FLAGS="-pthread"
WS2_LIBS=""

case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac

# Check whether --enable-assembly was given.

@@ -6542,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h

fi

if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :

printf "%s\n" "#define USE_AVX 1" >>confdefs.h

{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :

printf "%s\n" "#define USE_XOP 1" >>confdefs.h

{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }

else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}

fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :

printf "%s\n" "#define USE_AVX2 1" >>confdefs.h

{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :

printf "%s\n" "#define USE_AVX512 1" >>confdefs.h

{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }

else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}

fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext

else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}

fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext

else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}

fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi

# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}

@@ -6705,51 +6582,7 @@ else $as_nop
fi


# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthread $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthread_pthread_create=yes
else $as_nop
ac_cv_lib_pthread_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
fi

else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :

@@ -6787,12 +6620,132 @@ printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthread"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5
printf %s "checking for pthread_create in -lpthreadGC2... " >&6; }
if test ${ac_cv_lib_pthreadGC2_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC2 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC2_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC2_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC2_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5
printf %s "checking for pthread_create in -lpthreadGC1... " >&6; }
if test ${ac_cv_lib_pthreadGC1_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC1 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC1_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC1_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC1_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC1"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5
printf %s "checking for pthread_create in -lpthreadGC... " >&6; }
if test ${ac_cv_lib_pthreadGC_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC"

fi

fi

fi

fi

LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"

{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }

@@ -6847,14 +6800,6 @@ else
USE_ASM_FALSE=
fi

if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi

if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'

@@ -6863,12 +6808,12 @@ else
ARCH_x86_64_FALSE=
fi

if test x$have_arm = xtrue; then
ARCH_ARM_TRUE=
ARCH_ARM_FALSE='#'
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
ARCH_ARM_TRUE='#'
ARCH_ARM_FALSE=
ARCH_ARM64_TRUE='#'
ARCH_ARM64_FALSE=
fi

if test "x$OS" = "xWindows_NT"; then

@@ -6879,13 +6824,15 @@ else
MINGW_FALSE=
fi


if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
if test x$have_apple = xtrue; then
HAVE_APPLE_TRUE=
HAVE_APPLE_FALSE='#'
else
JANSSON_LIBS=-ljansson
HAVE_APPLE_TRUE='#'
HAVE_APPLE_FALSE=
fi


# libcurl install path (for mingw : --with-curl=/usr/local)

# Check whether --with-curl was given.

@@ -6902,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi

# SSL install path (for mingw : --with-crypto=/usr/local/ssl)

# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi


if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi

CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"

#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)

# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])

@@ -7102,22 +7029,22 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_ARM_TRUE}" && test -z "${ARCH_ARM_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM\" was never defined.
if test -z "${ARCH_ARM64_TRUE}" && test -z "${ARCH_ARM64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi

: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0

@@ -7508,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.4, which was
This file was extended by cpuminer-opt $as_me 25.2, which was
generated by GNU Autoconf 2.71. Invocation command line was

CONFIG_FILES = $CONFIG_FILES

@@ -7576,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.4
cpuminer-opt config.status 25.2
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"


cpu-miner.c

@@ -206,7 +206,7 @@ static uint32_t last_block_height = 0;
static double highest_share = 0; // highest accepted share diff
static double lowest_share = 9e99; // lowest accepted share diff
static double last_targetdiff = 0.;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
static uint32_t hi_temp = 0;
static uint32_t prev_temp = 0;
#endif

@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;

#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];

GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
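
The zero-initialization of GROUP_AFFINITY above is likely not cosmetic: besides Group and Mask, the struct carries Reserved[3] words that the SetThreadGroupAffinity documentation requires to be zero, so stack garbage there can make the call fail intermittently. A minimal Windows-only sketch of the fixed shape (pins the calling thread to CPU 0 of group 0):

#if defined(_WIN32)
#include <windows.h>
#include <stdio.h>

int main( void )
{
   GROUP_AFFINITY affinity = {0};   /* zeroes Mask, Group and Reserved[3] */
   affinity.Group = 0;
   affinity.Mask  = 1ULL << 0;

   if ( !SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ) )
      printf( "SetThreadGroupAffinity failed: 0x%lx\n", GetLastError() );
   return 0;
}
#else
int main( void ) { return 0; }      /* Windows-specific API, no-op elsewhere */
#endif
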

@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}

@@ -992,19 +991,19 @@ void report_summary_log( bool force )

if ( rejected_share_count > 10 )
{
if ( rejected_share_count > ( submitted_share_count * .5 ) )
if ( rejected_share_count > ( submitted_share_count / 2 ) )
{
applog(LOG_ERR,"Excessive rejected share rate, exiting...");
exit(1);
}
else if ( rejected_share_count > ( submitted_share_count * .1 ) )
else if ( rejected_share_count > ( submitted_share_count / 10 ) )
applog(LOG_WARNING,"High rejected share rate, check settings.");
}
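
The threshold change just above swaps floating-point scaling (* .5, * .1) for integer division, keeping the comparison in integer math while preserving the intent: exit above a roughly 50% reject rate, warn above roughly 10% (integer division truncates, so the cutoffs shift marginally for counts that are not exact multiples). A standalone sketch with made-up counter values:

#include <stdio.h>

int main( void )
{
   unsigned submitted = 40, rejected = 21;   /* hypothetical counters */

   if ( rejected > submitted / 2 )           /* more than half rejected */
      printf( "excessive rejected share rate\n" );
   else if ( rejected > submitted / 10 )     /* more than ~10% rejected */
      printf( "high rejected share rate\n" );
   return 0;
}
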
|
||||
|
||||
gettimeofday( &now, NULL );
|
||||
timeval_subtract( &et, &now, &five_min_start );
|
||||
|
||||
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
|
||||
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
|
||||
|
||||
// Display CPU temperature and clock rate.
|
||||
int curr_temp = cpu_temp(0);
|
||||
@@ -1013,8 +1012,9 @@ void report_summary_log( bool force )
|
||||
|
||||
if ( !opt_quiet || ( curr_temp >= 80 ) )
|
||||
{
|
||||
int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 :
|
||||
curr_temp >= 70 ? 60 : 120;
|
||||
int wait_time = curr_temp >= 90 ? 5
|
||||
: curr_temp >= 80 ? 30
|
||||
: curr_temp >= 70 ? 60 : 120;
|
||||
timeval_subtract( &diff, &now, &cpu_temp_time );
|
||||
if ( ( diff.tv_sec > wait_time )
|
||||
|| ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) )
|
||||
@@ -1912,7 +1912,7 @@ static bool wanna_mine(int thr_id)
|
||||
{
|
||||
bool state = true;
|
||||
|
||||
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
|
||||
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
|
||||
|
||||
if (opt_max_temp > 0.0)
|
||||
{
|
||||
@@ -2400,7 +2400,7 @@ static void *miner_thread( void *userdata )
|
||||
{
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
|
||||
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32) || defined(__APPLE__))
|
||||
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
|
||||
#else
|
||||
float lo_freq = 0., hi_freq = 0.;
|
||||
@@ -2840,8 +2840,6 @@ static void show_credits()
|
||||
static bool cpu_capability( bool display_only )
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_aarch64 = cpu_arch_aarch64();
|
||||
bool cpu_has_x86_64 = cpu_arch_x86_64();
|
||||
bool cpu_has_sse2 = has_sse2(); // X86_64 only
|
||||
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
|
||||
bool cpu_has_sse41 = has_sse41(); // X86_64 only
|
||||
@@ -2914,7 +2912,8 @@ static bool cpu_capability( bool display_only )
|
||||
sw_arm_arch = __ARM_ARCH;
|
||||
#endif
|
||||
#endif
|
||||
// x86_64_only
|
||||
|
||||
// x86_64 only
|
||||
#if defined(__SSE2__)
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
@@ -2942,9 +2941,10 @@ static bool cpu_capability( bool display_only )
|
||||
#if defined(__AVX10_1_512__)
|
||||
sw_has_avx10_512 = true;
|
||||
#endif
|
||||
|
||||
// x86_64 or AArch64
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
sw_has_aes = true;
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __VAES__
|
||||
sw_has_vaes = true;
|
||||
@@ -2955,6 +2955,7 @@ static bool cpu_capability( bool display_only )
|
||||
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
|
||||
sw_has_sha512 = true;
|
||||
#endif
|
||||
|
||||
// AArch64 only
|
||||
#if defined(__ARM_NEON)
|
||||
sw_has_neon = true;
|
||||
@@ -2972,87 +2973,93 @@ static bool cpu_capability( bool display_only )
|
||||
sw_has_sme2 = true;
|
||||
#endif
|
||||
|
||||
// CPU
|
||||
cpu_brand_string( cpu_brand );
|
||||
printf( "CPU: %s\n", cpu_brand );
|
||||
|
||||
printf("SW built on " __DATE__
|
||||
#ifdef _MSC_VER
|
||||
" with VC++ 2013\n");
|
||||
// Build
|
||||
printf( "SW built on " __DATE__
|
||||
#if defined(__clang__)
|
||||
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
|
||||
__clang_patchlevel__ );
|
||||
#elif defined(__GNUC__)
|
||||
" with GCC-");
|
||||
printf("%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
|
||||
#else
|
||||
printf("\n");
|
||||
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
|
||||
#endif
|
||||
|
||||
// OS
|
||||
#if defined(__linux)
|
||||
printf(" Linux\n");
|
||||
#elif defined(WIN32)
|
||||
printf(" Windows\n");
|
||||
printf(" Windows");
|
||||
#if defined(__MINGW64__)
|
||||
printf(" MinGW-w64\n");
|
||||
#else
|
||||
printf("\n");
|
||||
#endif
|
||||
#elif defined(__APPLE__)
|
||||
printf(" MacOS\n");
|
||||
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" BSD/Unix\n");
#else
printf("\n");
#endif

printf("CPU features: ");
if ( cpu_has_x86_64 )
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
}
else if ( cpu_has_aarch64 )
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );

printf("\nSW features: ");
if ( sw_has_x86_64 )
{
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
}
else if ( sw_has_aarch64 )
{
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
}
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );

if ( !display_only )
{
printf("\nAlgo features: ");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
@@ -3060,7 +3067,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
@@ -3105,6 +3112,7 @@ static bool cpu_capability( bool display_only )
return true;
}

/*
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
@@ -3161,12 +3169,11 @@ void show_version_and_exit(void)
printf("\n");
exit(0);
}

*/
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3182,7 +3189,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;

switch( key )
@@ -3326,7 +3332,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url

case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3391,7 +3398,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass

case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3551,10 +3559,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);

default:
@@ -3693,9 +3701,6 @@ int main(int argc, char *argv[])
{
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;

// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}

#else
@@ -3866,12 +3871,23 @@ int main(int argc, char *argv[])
}
#endif

#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)

#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN32_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN32_WINNT undefined." );
#endif

#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif

#endif

conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );

@@ -3892,7 +3908,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];
12 miner.h
@@ -3,10 +3,7 @@

#include <cpuminer-config.h>

#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif

// CPU architecture
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
@@ -17,14 +14,15 @@
#define USER_AGENT_ARCH
#endif

// Operating system
// __APPLE__ includes MacOS & IOS, no MacOS only macros found.
#if defined(__linux)
#define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32)
#define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else
#define USER_AGENT_OS
@@ -191,7 +189,7 @@ static inline uint32_t swab32(uint32_t x)
return __builtin_bswap32(x);
#else
return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );

// return bswap_32(v);
68 simd-utils.h
@@ -29,7 +29,6 @@
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors (Not used in cpuminer-opt)
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2)
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -141,18 +140,15 @@
#include <stdint.h>
#include <stddef.h>

// GCC-14.1: the AVX512 macros are defined even when compiled with only
// -mavx10.1-256, causing compile errors in AVX512 code. Only with
// -mavx10.1-512 does it compile successfully.
// __EVEX512__ is set only when compiled with -mavx10.1-512.
// Adding -fno-evex512 doesn't help.
// Building with -mapxf fails on a CPU without APX because configure can't
// run its test program.
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only without AVX10.

/*
// Test for macros
#ifdef __AVX10__
#warning "__AVX10__"
#endif
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#endif
@@ -162,39 +158,38 @@
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#warning "AVX512"
#endif
*/

// AVX10 complicates vector support by adding AVX512 features to CPUs without 512 bit
// vector support. AVX10.1 is just a renaming of AVX512 and is available only for
// Intel P-core-only CPUs. AVX10.2 adds support for E-cores that don't support 512 bit
// vectors. The following macros simplify things.
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and must be
// tested separately.
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested separately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.

// AVX10 can exist without support for 512 bit vectors.
#if defined(__AVX10_1_512__)
#define SIMD512 1
#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SIMD512 1
#endif
#if defined(__AVX10_1__)

// AVX512VL instructions applied to 256 & 128 bit vectors are supported with
// either AVX512VL or AVX10. Support for CPUs without 512 bit vectors is available
// with AVX10.2.
#if defined(__AVX10_2__) || defined(__AVX10_1_512__)
#define VL256 1
#elif defined(__AVX512VL__)
#define VL256 1
#endif

// VBMI does not exist on early versions of AVX512
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif

#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#define VL256 1
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VBMI 1
#endif

#endif

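As a usage sketch (not part of the diff), downstream code can branch on these aggregate macros instead of testing raw compiler macros directly; the lane counts below are illustrative assumptions:

```c
// Hypothetical dispatch sketch assuming the SIMD512 / VL256 / VBMI macros above.
#if defined(SIMD512)
   #define HASH_LANES 16     // 512 bit vectors: AVX512 or AVX10.1-512
#elif defined(__AVX2__)
   #define HASH_LANES 8      // 256 bit vectors, optionally with VL256 EVEX features
#else
   #define HASH_LANES 4      // baseline 128 bit SSE2 or NEON vectors
#endif
```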
/*
@@ -221,9 +216,6 @@

#include "simd-utils/simd-int.h"

// x86_64 MMX 64 bit vectors
#include "simd-utils/simd-64.h"

// x86_64 SSE2 128 bit vectors
#include "simd-utils/simd-128.h"

@@ -233,10 +225,6 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"

// move up after cleaning
// CPU architecture abstraction
//#include "simd-utils/simd-portable.h"

// aarch64 neon 128 bit vectors
#include "simd-utils/simd-neon.h"

@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,

// 4x32

#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
#if defined(__x86_64__) && defined(__SSE2__)

#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
{ \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
}

static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
}

#elif defined(__aarch64__) && defined(__ARM_NEON)

static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
uint32x4x4_t s;

s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );

s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );

if ( bit_len <= 256 ) return;

s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );

s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );

if ( bit_len <= 512 ) return;

s.val[0] = casti_v128u32( src0, 4 );
s.val[1] = casti_v128u32( src1, 4 );
s.val[2] = casti_v128u32( src2, 4 );
s.val[3] = casti_v128u32( src3, 4 );
vst4q_u32( dst + 256, s );

if ( bit_len <= 640 ) return;

s.val[0] = casti_v128u32( src0, 5 );
s.val[1] = casti_v128u32( src1, 5 );
s.val[2] = casti_v128u32( src2, 5 );
s.val[3] = casti_v128u32( src3, 5 );
vst4q_u32( dst + 320, s );

s.val[0] = casti_v128u32( src0, 6 );
s.val[1] = casti_v128u32( src1, 6 );
s.val[2] = casti_v128u32( src2, 6 );
s.val[3] = casti_v128u32( src3, 6 );
vst4q_u32( dst + 384, s );

s.val[0] = casti_v128u32( src0, 7 );
s.val[1] = casti_v128u32( src1, 7 );
s.val[2] = casti_v128u32( src2, 7 );
s.val[3] = casti_v128u32( src3, 7 );
vst4q_u32( dst + 448, s );

// if ( bit_len <= 1024 ) return;
}

static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
uint32x4x4_t s;

s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );

s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );

s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );

s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
}

static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32x4x4_t s = vld4q_u32( src );

casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];

s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];

if ( bit_len <= 256 ) return;

s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];

s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];

if ( bit_len <= 512 ) return;

s = vld4q_u32( src + 256 );
casti_v128( dst0, 4 ) = s.val[0];
casti_v128( dst1, 4 ) = s.val[1];
casti_v128( dst2, 4 ) = s.val[2];
casti_v128( dst3, 4 ) = s.val[3];

if ( bit_len <= 640 ) return;

s = vld4q_u32( src + 320 );
casti_v128( dst0, 5 ) = s.val[0];
casti_v128( dst1, 5 ) = s.val[1];
casti_v128( dst2, 5 ) = s.val[2];
casti_v128( dst3, 5 ) = s.val[3];

s = vld4q_u32( src + 384 );
casti_v128( dst0, 6 ) = s.val[0];
casti_v128( dst1, 6 ) = s.val[1];
casti_v128( dst2, 6 ) = s.val[2];
casti_v128( dst3, 6 ) = s.val[3];

s = vld4q_u32( src + 448 );
casti_v128( dst0, 7 ) = s.val[0];
casti_v128( dst1, 7 ) = s.val[1];
casti_v128( dst2, 7 ) = s.val[2];
casti_v128( dst3, 7 ) = s.val[3];

// if ( bit_len <= 1024 ) return;
}

static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
uint32x4x4_t s = vld4q_u32( src );

casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];

s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];

s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];

s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
}

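For reference, a scalar sketch (not from the diff) of the layout the NEON 4x32 interleave above produces: word i of each of four lanes lands at dst[4*i + lane], which is exactly what each vst4q_u32 writes per 16 byte group. Names here are hypothetical:

```c
// Scalar model of intrlv_4x32 for bit_len == 256 (8 words per lane).
#include <stdint.h>

void intrlv_4x32_ref( uint32_t dst[32], const uint32_t s0[8],
                      const uint32_t s1[8], const uint32_t s2[8],
                      const uint32_t s3[8] )
{
   for ( int i = 0; i < 8; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];   // lane 0
      dst[ 4*i + 1 ] = s1[i];   // lane 1
      dst[ 4*i + 2 ] = s2[i];   // lane 2
      dst[ 4*i + 3 ] = s3[i];   // lane 3
   }
}
```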
#else // !SSE2 && !NEON

static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )

#endif

#if defined(__SSE2__)

static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u32_t s0 = casti_v128u32( src,0 );
v128u32_t s1 = casti_v128u32( src,1 );
v128u32_t s2 = casti_v128u32( src,2 );
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );

#if defined(__SSSE3__)

@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )

#endif

casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );

casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );

casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );

casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );

casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128u32( d,16 ) = v128_duplane32( s4, 0 );
casti_v128u32( d,17 ) = v128_duplane32( s4, 1 );
casti_v128u32( d,18 ) = v128_duplane32( s4, 2 );
casti_v128u32( d,19 ) = v128_duplane32( s4, 3 );
}

#elif defined(__aarch64__) && defined(__ARM_NEON)

static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );

s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );

casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );

casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );

casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );

casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );

casti_v128( d,16 ) = vdupq_laneq_u32( s4, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s4, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s4, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s4, 3 );
}

#endif

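Both v128_bswap32_intrlv80_4x32 variants above compute the same thing; a scalar model (an illustrative assumption, not project code) makes the intent clear: byte swap the twenty 32 bit words of an 80 byte block header, then broadcast each word across all four lanes:

```c
// Scalar reference: d holds 20 words x 4 lanes = 80 x 32 bit words.
#include <stdint.h>

void ref_bswap32_intrlv80_4x32( uint32_t d[80], const uint32_t s[20] )
{
   for ( int i = 0; i < 20; i++ )
   {
      uint32_t w = __builtin_bswap32( s[i] );           // endian swap one word
      d[ 4*i + 0 ] = d[ 4*i + 1 ] = w;                  // broadcast to all
      d[ 4*i + 2 ] = d[ 4*i + 3 ] = w;                  // four lanes
   }
}
```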
// 8x32

#if defined(__AVX2__)

#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data

// 2x64 SSE2, NEON
// 2x64

#if defined(__x86_64__) && defined(__SSE2__)

static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d1[7] = v128_unpackhi64( s[14], s[15] );
}

/*
#elif defined(__aarch64__) && defined(__ARM_NEON)

static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64x2x2_t s;

s.val[0] = casti_v128u64( src0, 0 );
s.val[1] = casti_v128u64( src1, 0 );
vst2q_u64( dst, s );

s.val[0] = casti_v128u64( src0, 1 );
s.val[1] = casti_v128u64( src1, 1 );
vst2q_u64( dst + 32, s );

if ( bit_len <= 256 ) return;

s.val[0] = casti_v128u64( src0, 2 );
s.val[1] = casti_v128u64( src1, 2 );
vst2q_u64( dst + 64, s );

s.val[0] = casti_v128u64( src0, 3 );
s.val[1] = casti_v128u64( src1, 3 );
vst2q_u64( dst + 96, s );

if ( bit_len <= 512 ) return;

s.val[0] = casti_v128u64( src0, 4 );
s.val[1] = casti_v128u64( src1, 4 );
vst2q_u64( dst + 128, s );

if ( bit_len <= 640 ) return;

s.val[0] = casti_v128u64( src0, 5 );
s.val[1] = casti_v128u64( src1, 5 );
vst2q_u64( dst + 160, s );

s.val[0] = casti_v128u64( src0, 6 );
s.val[1] = casti_v128u64( src1, 6 );
vst2q_u64( dst + 192, s );

s.val[0] = casti_v128u64( src0, 7 );
s.val[1] = casti_v128u64( src1, 7 );
vst2q_u64( dst + 224, s );

// if ( bit_len <= 1024 ) return;
}

static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64x2x2_t s = vld2q_u64( src );

casti_v128u64( dst0, 0 ) = s.val[0];
casti_v128u64( dst1, 0 ) = s.val[1];

s = vld2q_u64( src + 32 );
casti_v128u64( dst0, 1 ) = s.val[0];
casti_v128u64( dst1, 1 ) = s.val[1];

if ( bit_len <= 256 ) return;

s = vld2q_u64( src + 64 );
casti_v128u64( dst0, 2 ) = s.val[0];
casti_v128u64( dst1, 2 ) = s.val[1];

s = vld2q_u64( src + 96 );
casti_v128u64( dst0, 3 ) = s.val[0];
casti_v128u64( dst1, 3 ) = s.val[1];

if ( bit_len <= 512 ) return;

s = vld2q_u64( src + 128 );
casti_v128u64( dst0, 4 ) = s.val[0];
casti_v128u64( dst1, 4 ) = s.val[1];

if ( bit_len <= 640 ) return;

s = vld2q_u64( src + 160 );
casti_v128u64( dst0, 5 ) = s.val[0];
casti_v128u64( dst1, 5 ) = s.val[1];

s = vld2q_u64( src + 192 );
casti_v128u64( dst0, 6 ) = s.val[0];
casti_v128u64( dst1, 6 ) = s.val[1];

s = vld2q_u64( src + 224 );
casti_v128u64( dst0, 7 ) = s.val[0];
casti_v128u64( dst1, 7 ) = s.val[1];

// if ( bit_len <= 1024 ) return;
}

#else

static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}
*/
/*

static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
*/

#endif

static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u64_t s0 = casti_v128u64( src,0 );
v128u64_t s1 = casti_v128u64( src,1 );
v128u64_t s2 = casti_v128u64( src,2 );
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );

#if defined(__SSSE3__)

@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )

#endif

#if defined(__SSE2__)
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );

casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );

casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );

casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );

casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );

casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );

#elif defined(__ARM_NEON)

casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );

casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );

casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );

casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );

casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );

#endif
casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}

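A scalar model (an illustrative assumption, not project code) of the 2x64 variant above: byte swap each 32 bit word of the 80 byte header, then broadcast each aligned 64 bit pair to both lanes:

```c
// Scalar reference: d holds 10 pairs x 2 lanes = 20 x 64 bit words.
#include <stdint.h>

void ref_bswap32_intrlv80_2x64( uint64_t d[20], const uint32_t s[20] )
{
   for ( int i = 0; i < 10; i++ )
   {
      uint64_t lo = __builtin_bswap32( s[2*i] );
      uint64_t hi = __builtin_bswap32( s[2*i+1] );
      uint64_t pair = lo | ( hi << 32 );    // little endian 64 bit pair
      d[2*i] = d[2*i+1] = pair;             // both lanes get the same pair
   }
}
```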
static inline void extr_lane_2x64( void *dst, const void *src,

@@ -439,11 +439,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )

#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( _mm_xor_si128( a, b ), c )

#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( _mm_and_si128( a, b ), c )

#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( _mm_or_si128( a, b ), c )

#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )

@@ -174,17 +174,22 @@ static inline __m256i mm256_not( const __m256i v )

#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )

// usage hints to improve performance when ternary logic is not available:
// If overwriting an input arg put that arg first so the intermediate
// result can be stored in the dest.
// Put an arg with the nearest dependency last so independent args can be
// processed first.
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
_mm256_xor_si256( _mm256_xor_si256( a, b ), c )

#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
_mm256_and_si256( _mm256_and_si256( a, b ), c )

#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
_mm256_or_si256( _mm256_or_si256( a, b ), c )

#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
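A hedged usage sketch of the hints above (round_mix and its operands are hypothetical, not from the tree): when h is about to be overwritten it goes first so its register can hold the intermediate, and m, carrying the latest dependency, goes last:

```c
#include <immintrin.h>

// h = h ^ k ^ m, associated as ( h ^ k ) ^ m per the new mm256_xor3 ordering.
static inline __m256i round_mix( __m256i h, __m256i k, __m256i m )
{
   return _mm256_xor_si256( _mm256_xor_si256( h, k ), m );
}
```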
@@ -1,193 +0,0 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1

#if defined(__x86_64__) && defined(__MMX__)

////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
//
// This code is not used anywhere and likely never will be. Its intent was
// to support 2 way parallel hashing using MMX, or NEON, for 32 bit hash
// functions, but it was never implemented.
//
// MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
// registers and instructions. MMX will still be available using ASM.
// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
// using explicit SSE with the upper 64 bits being ignored.
// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
// of the even index SSE register.
// In most cases the existing 4x32 SSE code can be used with 2 lanes being ignored,
// making this file obsolete.
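A sketch of the approach that comment describes (a hypothetical helper, not part of the tree): keep 64 bit "vectors" in SSE registers and pull an odd array index out of the high half of the even index's 128 bit load:

```c
#include <immintrin.h>
#include <stdint.h>

// Load element i of a uint64_t array into the low 64 bits of an SSE register;
// the upper 64 bits are ignored by the caller.
static inline __m128i v64_load_sse( const uint64_t *p, int i )
{
   __m128i v = _mm_loadu_si128( (const __m128i*)( p + ( i & ~1 ) ) );
   // Odd index: shift the high 64 bits down into the low half.
   return ( i & 1 ) ? _mm_srli_si128( v, 8 ) : v;
}
```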
#define v64_t __m64
#define v64u32_t v64_t

#define v64_load _mm_load_si64
#define v64_store _mm_store_si64

#define v64_64(i64) ((__m64)(i64))
#define v64_32 _mm_set1_pi32
#define v64_16 _mm_set1_pi16
#define v64_8 _mm_set1_pi8

#define v64_add32 _mm_add_pi32
#define v64_add16 _mm_add_pi16
#define v64_add8 _mm_add_pi8

#define v64_mul32 _mm_mullo_pi32
#define v64_mul16 _mm_mullo_pi16

// compare
#define v64_cmpeq32 _mm_cmpeq_epi32
#define v64_cmpeq16 _mm_cmpeq_epi16
#define v64_cmpeq8 _mm_cmpeq_epi8

#define v64_cmpgt32 _mm_cmpgt_epi32
#define v64_cmpgt16 _mm_cmpgt_epi16
#define v64_cmpgt8 _mm_cmpgt_epi8

#define v64_cmplt32 _mm_cmplt_epi32
#define v64_cmplt16 _mm_cmplt_epi16
#define v64_cmplt8 _mm_cmplt_epi8

// bit shift
#define v64_sl32 _mm_slli_epi32
#define v64_sl16 _mm_slli_epi16
#define v64_sl8 _mm_slli_epi8

#define v64_sr32 _mm_srli_epi32
#define v64_sr16 _mm_srli_epi16
#define v64_sr8 _mm_srli_epi8

#define v64_sra32 _mm_srai_epi32
#define v64_sra16 _mm_srai_epi16
#define v64_sra8 _mm_srai_epi8

#define v64_alignr8 _mm_alignr_pi8
#define v64_unpacklo32 _mm_unpacklo_pi32
#define v64_unpackhi32 _mm_unpackhi_pi32
#define v64_unpacklo16 _mm_unpacklo_pi16
#define v64_unpackhi16 _mm_unpackhi_pi16
#define v64_unpacklo8 _mm_unpacklo_pi8
#define v64_unpackhi8 _mm_unpackhi_pi8

// Pseudo constants

#define v64_zero _mm_setzero_si64()
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
#define v64_one_32 v64_32( 1UL )
#define v64_one_16 v64_16( 1U )
#define v64_one_8 v64_8( 1U )
#define v64_neg1 v64_32( 0xFFFFFFFFUL )

#define casti_v64(p,i) (((v64_t*)(p))[(i)])

// Bitwise not: ~(a)
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) ) )

/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/

static inline void v64_memset_zero( __m64 *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }

static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }

static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

#define v64_or _mm_or_si64
#define v64_and _mm_and_si64
#define v64_xor _mm_xor_si64
#define v64_andnot _mm_andnot_si64
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_xor( v1, v0 ) )
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )


// Rotate bits in packed elements of 64 bit vector
#define v64_rol64( a, n ) \
_mm_or_si64( _mm_slli_si64( a, n ), \
_mm_srli_si64( a, 64-(n) ) )

#define v64_ror64( a, n ) \
_mm_or_si64( _mm_srli_si64( a, n ), \
_mm_slli_si64( a, 64-(n) ) )

#define v64_rol32( a, n ) \
_mm_or_si64( _mm_slli_pi32( a, n ), \
_mm_srli_pi32( a, 32-(n) ) )

#define v64_ror32( a, n ) \
_mm_or_si64( _mm_srli_pi32( a, n ), \
_mm_slli_pi32( a, 32-(n) ) )

#define v64_rol16( a, n ) \
_mm_or_si64( _mm_slli_pi16( a, n ), \
_mm_srli_pi16( a, 16-(n) ) )

#define v64_ror16( a, n ) \
_mm_or_si64( _mm_srli_pi16( a, n ), \
_mm_slli_pi16( a, 16-(n) ) )

// Rotate packed elements across lanes. Useful for byte swap and byte
// rotation.

#if defined(__SSE__)

// Swap hi & lo 32 bits.
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )

#define v64_shulfr16( a ) _mm_shuffle_pi16( a, 0x39 )
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )

// Swap hi & lo 16 bits of each 32 bit element
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )

#endif // SSE

#if defined(__SSSE3__)

// Endian byte swap packed elements

#define v64_bswap32( v ) \
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )

#define v64_bswap16( v ) \
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 )

// Rotate right by c bytes
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }

#else

#define v64_bswap32( v ) \
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )

#define v64_bswap16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )

#endif // SSSE3

#define v64_blendv( v1, v0, mask ) \
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )


#endif // MMX

#endif // SIMD_64_H__

@@ -2,7 +2,7 @@
#define SIMD_INT_H__ 1

//TODO compile time test for byte order
// be64 etc using HW bowap.
// be64 etc using HW bswap.
//
// Endian byte swap
#if defined(__x86_64__)
@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
return b;
}

// This produces warnings from clang, but its suggested workaround
// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
// it works as is on both.
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
@@ -94,7 +97,7 @@ static inline uint16_t be16( const uint16_t u16 )
return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) << 8 );
}

static inline uint32_t le162( const uint16_t u16 )
static inline uint32_t le16( const uint16_t u16 )
{
const uint8_t *p = (uint8_t const *)&u16;
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
@@ -112,7 +115,7 @@ static inline uint32_t le162( const uint16_t u16 )
#elif defined(__aarch64__)

// Documentation is vague, ror exists but is ambiguous. Docs say it can
// do 32 or 64 registers. Assuming that is architecture specific andcan
// do 32 or 64 bit registers. Assuming that is architecture specific and can
// only do 32 bit on 32 bit arch. Rarely used so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{

@@ -93,6 +93,8 @@
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )

#define v128_cmpeq_zero vceqzq_u64

// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
@@ -135,14 +137,14 @@
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#endif

// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )

// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )

// v2 ^ ( ~v1 & v0 )
#if defined(__ARM_FEATURE_SHA3)
@@ -178,6 +180,7 @@
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )

// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.

// AES
// consistent with Intel AES intrinsics, break up for optimizing
@@ -237,18 +240,15 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))

#define v128_zero v128_64( 0ull )

#define v128_cmpeq_zero vceqzq_u64

#define v128_neg1 v128_64( 0xffffffffffffffffull )

// set1
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
#define v128_8 vmovq_n_u8

#define v128_zero v128_64( 0ull )
#define v128_neg1 v128_64( 0xffffffffffffffffull )

#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint16x8_t)(v)), c )

#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
((uint16x8_t)(v)), c )

#define v128_ror8( v, c ) \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )

#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )


// ( v1 ^ v0 ) >>> n
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)

#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )

#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
#else

#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )

#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif

#define v128_2ror64( v1, v0, c ) \
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}

#define v128_2rorx32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_lrev16 vrev32q_u16

// aka bswap
#define v128_qrev8 vrev64q_u8
#define v128_lrev8 vrev32q_u8
#define v128_wrev8 vrev16q_u8
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8

// full vector rotation

@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )

// Usefull for x86_64 but does nothing for ARM
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )

// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )

#endif // __ARM_NEON
#endif // SIMD_NEON_H__

@@ -1,25 +1,152 @@
// Placeholder for now.
//
// This file will hold AArch64 SVE code, a replacement for NEON that uses vector length
// agnostic instructions. This means the same code can be used on CPUs with different
// SVE vector register lengths. This is not good for vectorized hashing.
// This file will hold AArch64 SVE code, a replacement for NEON that uses
// vector length agnostic instructions. This means the same code can be used
// on CPUs with different SVE vector register lengths. This is not good for
// vectorized hashing.
// Optimum hash is sensitive to the vector register length with different code
// used for different register sizes. On X86_64 the vector length is tied to the CPU
// feature making it simple and efficient to handle different lengths although it
// results in multiple executables. Theoretically SVE could use a single executable for
// any vector length.
// used for different register sizes. On X86_64 the vector length is tied to
// the CPU feature making it simple and efficient to handle different lengths
// although it results in multiple executables. Theoretically SVE could use a
// single executable for any vector length.
//
// With the SVE vector length only known at run time it results in run time overhead
// to test the vector length. Theoretically it could be tested at program loading and
// appropriate libraries loaded. However I don't know if this can be done and if so
// how to do it.
// With the SVE vector length only known at run time it results in run time
// overhead to test the vector length. Theoretically it could be tested at
// program loading and appropriate libraries loaded. However I don't know if
// this can be done and if so how to do it.
//
// SVE is not expected to be used for 128 bit vectors as it does not provide any
// advantages over NEON. However, it may be implemented for testing purposes
// because CPUs with registers larger than 128 bits are currently very rare and very
// expensive server class CPUs.
// because CPUs with registers larger than 128 bits are currently very rare and
// very expensive server class CPUs.
//
// N-way parallel hashing could be the best use of SVE, using the same code for all
// vector lengths with the only variable being the number of lanes. This will still
// require run time checking but should be lighter than substituting functions.
// However, 128 bit vectors also need to be supported with 256 bit registers.
// This could be a challenge for un-predicated functions.
//
// N-way parallel hashing could be the best use of SVE, using the same code
// for all vector lengths with the only variable being the number of lanes.
// This will still require run time checking but should be lighter than
// substituting functions.

// Current approach is to hard code the length in these intrinsics, called
// by existing length specific code.
// Define with sv_ prefix for generic use, predicate provided by caller;
// use sv<size>_ with hard coded predicate.
// v<size>_ only if and when it's compatible with SSE & NEON.

// Many instructions have no predicate operand, how is VVL handled?
// How does the CPU know how long the vector is and whether it spans
// multiple registers without the predicate?

// Also how does the predicate define the vector size? How to tell if inactive
// high lanes are part of the vector or beyond its range.
//
// Some instructions may have an implied predicate by other arguments.
// TBL for example will only have shuffle indexes for active lanes.
// However this is dependent on software being aware of register size.
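A hedged sketch of the hard coded length approach discussed above, using standard ACLE SVE intrinsics (the helper name is hypothetical, and a 256 bit or wider SVE implementation is assumed so the eight lane predicate is fully resident):

```c
#include <arm_sve.h>

// XOR with a fixed 256 bit effective width: a predicate covering the first
// eight 32 bit lanes, regardless of the hardware vector length.
static inline svuint32_t sv256_xor_sketch( svuint32_t a, svuint32_t b )
{
   svbool_t p = svwhilelt_b32_s32( 0, 8 );   // lanes 0..7 active
   return sveor_u32_x( p, a, b );
}
```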

#if 0
// #if defined USE_SV128
// NEON needs to be disabled

#define PRED128 0xffff
#define PRED256 0xffffffff

// Types should be transparent

#define sv128u32_t svuint32_t
#define sv256u32_t svuint32_t

// load1

// arithmetic

// _z zero inactive elements, _x undefined inactive elements, _m inactive
// elements from first arg. arg order only matters when _m used. Use _x.

#define sv_add32( p, v1, v0 ) svadd_u32_x( p, v1, v0 )

#define sv128_add32( v1, v0 ) svadd_u32_x( PRED128, v1, v0 )
#define sv256_add32( v1, v0 ) svadd_u32_x( PRED256, v1, v0 )

// Add integer to each element
#define sv_addi32( p, v, i ) svadd_n_u32_x( p, v, i )

// compare

#define sv_cmpeq32( p, v1, v0 ) svcmpeq_u32( p, v1, v0 )

#define sv128_cmpeq32( v1, v0 ) svcmpeq_u32( PRED128, v1, v0 )
#define sv256_cmpeq32( v1, v0 ) svcmpeq_u32( PRED256, v1, v0 )

// bit shift

#define sv_sl32( p, v, c ) svlsl_n_u32_x( p, v, c )

#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )

// logic

#define sv_or( p, v1, v0 ) svorr_u32_x( p, v1, v0 )

#define sv128_or( v1, v0 ) svorr_u32_x( PRED128, v1, v0 )
#define sv256_or( v1, v0 ) svorr_u32_x( PRED256, v1, v0 )

// ext used for alignr, and zip used for unpack have no predicate arg.
// How is vector length determined? How are register sizes handled?
// How are part registers handled?

// alignr (ext)

// unpack

// AES

// AES uses fixed 128 bit vectors, how does this work with larger registers?

// set1

#define sv128_32( n ) svdup_n_u32_x( PRED128, n )
#define sv256_32( n ) svdup_n_u32_x( PRED256, n )

// broadcast

// svdup_lane has no predicate

// constants

// pointer cast

// Bit rotation

// No predication for shift instructions

// Cross lane shuffles

// Very limited shuffling, mostly svtbl which has no predicate and uses
// vector for the index.

// endian byte swap

#define sv128_bswap32( v ) svrevb_u32_x( PRED128, v )

// blend

#endif

25 sysinfos.c
@@ -16,14 +16,19 @@
#include "miner.h"
#include "simd-utils.h"

#if defined(__aarch64__)
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
#define ARM_AUXV
#endif

#if defined(ARM_AUXV)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <sys/prctl.h>
#endif

#ifndef WIN32
#if !(defined(WIN32) || defined(__APPLE__))

// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
@@ -147,7 +152,7 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )

static inline float cpu_temp( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0.;
#else
return linux_cputemp( core );
@@ -156,7 +161,7 @@ static inline float cpu_temp( int core )

static inline uint32_t cpu_clock( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0;
#else
return linux_cpufreq( core );
@@ -275,8 +280,8 @@ static inline int cpu_fanpercent()
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)

#if defined(__x86_64__)

static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -309,13 +314,14 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}

#elif defined(__aarch64__)
#elif defined(ARM_AUXV)

// Always test if HWCAP variable is defined in the kernel before attempting
// to compile it. If not defined the feature can't be tested and won't be
// included in the compile.
// This can occur if compiling with an old kernel and a new CPU and could
// result in a suboptimal build.
// leaf and subleaf arguments are ignored.

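A hedged example of the HWCAP guard pattern described above (the helper name is hypothetical): only reference a HWCAP flag when the toolchain's kernel headers actually define it, and report the feature absent otherwise:

```c
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <stdbool.h>

static inline bool cpu_has_sve_sketch( void )
{
#if defined(HWCAP_SVE)
   return ( getauxval( AT_HWCAP ) & HWCAP_SVE ) != 0;
#else
   return false;   // can't test: built with old kernel headers
#endif
}
```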
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
@@ -365,7 +371,8 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
}

#else
#define cpuid(leaf, subleaf, out) out[0] = 0;
#define cpuid( leaf, subleaf, output ) \
output[0] = output[1] = output[2] = output[3] = 0;
#endif

static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -958,10 +965,10 @@ static inline unsigned int avx10_vector_length()
return 0;
}

// ARM SVE vector register length
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
#if defined(__aarch64__)
#if defined(ARM_AUXV)
if ( has_sve() )
return ( prctl( PR_SVE_GET_VL ) & PR_SVE_VL_LEN_MASK ) * 8;
#endif

6 util.c
@@ -1414,6 +1414,12 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
int n;
fd_set wd;

// Something nasty going on with Windows on aarch64. This hack prevents
// corrupting the sctx pointer. It only works if placed inside the while loop.
#if defined(__aarch64__) && defined(WIN32) && defined(ARM_WIN_HACK)
printf("");
#endif

FD_ZERO( &wd );
FD_SET( sctx->sock, &wd );
if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )

@@ -11,16 +11,11 @@

export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"

@@ -40,7 +35,6 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
#cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

# Start building...