mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
9 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
3dd6787531 | ||
![]() |
cae1ce2ab7 | ||
![]() |
7a91c41d74 | ||
![]() |
c6bc9d67fb | ||
![]() |
b339450898 | ||
![]() |
fb93160641 | ||
![]() |
520d4d5384 | ||
![]() |
da7030faa8 | ||
![]() |
bd84f199fe |
@@ -1,4 +1,6 @@
|
||||
|
||||
These instructions may be out of date, see the Wiki for the latest...
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
1. Requirements:
|
||||
---------------
|
||||
@@ -35,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher.
|
||||
|
||||
znver1 and znver2 should be recognized on most recent version of GCC and
|
||||
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
|
||||
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
|
||||
In the meantime here are some suggestions to compile with new CPUs:
|
||||
|
||||
"-march=native" is usually the best choice, used by build.sh.
|
||||
|
@@ -1,6 +1,6 @@
|
||||
Instructions for compiling cpuminer-opt for Windows.
|
||||
|
||||
Thwaw intructions nay be out of date. Please consult the wiki for
|
||||
These instructions are out of date. Please consult the wiki for
|
||||
the latest:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
algo/blake/blakecoin-4way.c \
|
||||
algo/blake/decred-gate.c \
|
||||
algo/blake/decred.c \
|
||||
algo/blake/decred-4way.c \
|
||||
algo/blake/pentablake-gate.c \
|
||||
algo/blake/pentablake-4way.c \
|
||||
algo/blake/pentablake.c \
|
||||
@@ -205,7 +202,6 @@ cpuminer_SOURCES = \
|
||||
algo/verthash/tiny_sha3/sha3.c \
|
||||
algo/verthash/tiny_sha3/sha3-4way.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
algo/whirlpool/whirlpool-gate.c \
|
||||
algo/whirlpool/whirlpool.c \
|
||||
algo/whirlpool/whirlpoolx.c \
|
||||
|
61
README.md
61
README.md
@@ -40,17 +40,25 @@ Requirements
|
||||
Intel Core2 and newer and AMD equivalents. Further optimizations are available
|
||||
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
|
||||
|
||||
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
|
||||
performance.
|
||||
32 bit CPUs are not supported.
|
||||
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
|
||||
are not supported.
|
||||
|
||||
ARM and Aarch64 CPUs are not supported.
|
||||
Mobile CPUs, such as those in laptop computers, are not recommended because they aren't
|
||||
designed for the extreme heat of operating at full load for extended periods of
|
||||
time.
|
||||
|
||||
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
|
||||
|
||||
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
|
||||
including Mint and Centos, are known to work and have all dependencies
|
||||
in their repositories. Others may work but may require more effort. Older
|
||||
versions such as Centos 6 don't work due to missing features.
|
||||
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
|
||||
|
||||
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
|
||||
binaries. WindowsXP 64 bit is YMMV.
|
||||
|
||||
FreeBSD is not actively tested but should work, YMMV.
|
||||
MacOS, OSx and Android are not supported.
|
||||
|
||||
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
||||
@@ -66,53 +74,50 @@ Supported Algorithms
|
||||
argon2d250 argon2d-crds, Credits (CRDS)
|
||||
argon2d500 argon2d-dyn, Dynamic (DYN)
|
||||
argon2d4096 argon2d-uis, Unitus, (UIS)
|
||||
axiom Shabal-256 MemoHash
|
||||
blake Blake-256 (SFR)
|
||||
blake2b Blake2b 256
|
||||
blake2s Blake-2 S
|
||||
blake Blake-256
|
||||
blake2b Blake2-512
|
||||
blake2s Blake2-256
|
||||
blakecoin blake256r8
|
||||
bmw BMW 256
|
||||
bmw512 BMW 512
|
||||
c11 Chaincoin
|
||||
c11
|
||||
decred
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
groestl Groestl coin
|
||||
hex x16r-hex
|
||||
hmq1725 Espers
|
||||
hmq1725
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
keccak Maxcoin
|
||||
keccakc Creative coin
|
||||
lbry LBC, LBRY Credits
|
||||
luffa Luffa
|
||||
lyra2h Hppcoin
|
||||
lyra2h
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2
|
||||
lyra2rev3 lyrav2v3
|
||||
lyra2z
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
minotaur Ringcoin (RNG)
|
||||
lyra2z330
|
||||
m7m
|
||||
minotaur
|
||||
minotaurx
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi
|
||||
phi2 Luxcoin (LUX)
|
||||
phi2-lux identical to phi2
|
||||
pluck Pluck:128 (Supcoin)
|
||||
phi2
|
||||
polytimos Ninja
|
||||
power2b MicroBitcoin (MBC)
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
scryptn2 scrypt(1048576, 1, 1)
|
||||
sha256d Double SHA-256
|
||||
sha256q Quad SHA-256, Pyrite (PYE)
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
sha256q Quad SHA-256
|
||||
sha256t Triple SHA-256
|
||||
sha3d Double keccak256 (BSHA3)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
@@ -128,17 +133,17 @@ Supported Algorithms
|
||||
x11 Dash
|
||||
x11evo Revolvercoin
|
||||
x11gost sib (SibCoin)
|
||||
x12 Galaxie Cash (GCH)
|
||||
x13 X13
|
||||
x12
|
||||
x13
|
||||
x13bcd bcd
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x14
|
||||
x15
|
||||
x16r
|
||||
x16rv2
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
x16rt
|
||||
x16rt-veil veil
|
||||
x16s
|
||||
x17
|
||||
x21s
|
||||
x22i
|
||||
|
30
README.txt
30
README.txt
@@ -1,12 +1,22 @@
|
||||
This file is included in the Windows binary package. Compile instructions
|
||||
for Linux and Windows can be found in RELEASE_NOTES.
|
||||
|
||||
This package is officially avalable only from:
|
||||
cpuminer-opt is open source and free of any fees. Many forks exist that are
|
||||
closed source and contain usage fees. Support open source free software.
|
||||
|
||||
This package is officially available only from:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt
|
||||
|
||||
No other sources should be trusted.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS or Powershell
|
||||
prompt. There is no GUI and no mouse support.
|
||||
command prompt. There is no GUI and no mouse support.
|
||||
|
||||
New users are encouraged to consult the cpuminer-opt Wiki for detailed
|
||||
information on usage:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
@@ -43,12 +53,11 @@ cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
cpuminer-avx2-sha.exe AMD Zen1, Zen2
|
||||
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
|
||||
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
|
||||
cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake
|
||||
cpuminer-avx512-sha-vaes.exe AMD Zen4, Intel Rocketlake, Icelake
|
||||
|
||||
* Alderlake is a hybrid architecture. With the E-cores disabled it may be
|
||||
possible to enable AVX512 on the the P-cores and use the avx512-sha-vaes
|
||||
build. This is not officially supported by Intel at time of writing.
|
||||
Check for current information.
|
||||
* Alderlake is a hybrid architecture with a mix of E-cores & P-cores. Although
|
||||
the P-cores can support AVX512 the E-cores can't so Intel decided to disable
|
||||
AVX512 on the P-cores.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
@@ -59,9 +68,10 @@ source code obtained from the author's official repository. The exact
|
||||
procedure is documented in the build instructions for Windows:
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Some DLL filess may already be installed on the system by Windows or third
|
||||
party packages. They often will work and may be used instead of the included
|
||||
file.
|
||||
Some included DLL files may already be installed on the system by Windows or
|
||||
third party packages. They often will work and may be used instead of the
|
||||
included version of the files.
|
||||
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
|
@@ -65,6 +65,76 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.22.0
|
||||
|
||||
Stratum: faster netdiff calculation.
|
||||
Merged a few updates from Pooler/cpuminer:
|
||||
Use CURLOPT_POSTFIELDS in json_rpc_call,
|
||||
Use CURLINFO_ACTIVESOCKET when supported,
|
||||
JSONRPC speedup,
|
||||
Speed up hex2bin function.
|
||||
Small log improvements, notably more frequent hash rate reports.
|
||||
Removed decred algo.
|
||||
|
||||
v3.21.5
|
||||
|
||||
All issues with v3.21.3 & v3.21.4 should be resolved.
|
||||
Changes since v3.21.2:
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
Small optimizations to serialized vectoring.
|
||||
|
||||
v3.21.4 CANCELLED
|
||||
|
||||
Reapply selected changes from v3.21.3.
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
|
||||
v3.21.3.1 UNRELEASED
|
||||
|
||||
Revert to 3.21.2
|
||||
|
||||
v3.21.3 CANCELLED
|
||||
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
Midstate prehash is now centralized, done only once instead of by every thread
|
||||
for selected algos.
|
||||
Small optimizations to serialized vectoring.
|
||||
|
||||
v3.21.2
|
||||
|
||||
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
|
||||
Fixed a couple of compiler warnings with gcc-12.
|
||||
|
||||
v3.21.1
|
||||
|
||||
Fixed a segfault in some obsolete algos.
|
||||
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
|
||||
|
||||
v3.21.0
|
||||
|
||||
Added minotaurx algo for stratum only.
|
||||
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
|
||||
Other small improvements.
|
||||
|
||||
v3.20.3
|
||||
|
||||
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
|
||||
Faster AVX2+VAES for anime 14%, hmq1725 6%.
|
||||
Small optimizations to Luffa AVX2 & AVX512.
|
||||
|
||||
v3.20.2
|
||||
|
||||
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
|
||||
@@ -75,7 +145,7 @@ v3.20.1
|
||||
sph_blake2b optimized 1-way SSSE3 & AVX2.
|
||||
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
|
||||
Removed imprecise hash & target display from rejected share log.
|
||||
Share and target difficulty is now displayed only for low diificulty shares.
|
||||
Share and target difficulty is now displayed only for low difficulty shares.
|
||||
Updated configure.ac to check for AVX512 asm support.
|
||||
Small optimization to Lyra2 SSE2.
|
||||
|
||||
@@ -92,12 +162,9 @@ v3.19.8
|
||||
|
||||
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
|
||||
url protocol specifier for requesting a secure stratum connection.
|
||||
|
||||
The full url, including the protocol, is now displayed in the stratum connect
|
||||
log and the periodic summary log.
|
||||
|
||||
Small optimizations to Cubehash, AVX2 & AVX512.
|
||||
|
||||
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
|
||||
|
||||
v3.19.7
|
||||
|
83
aclocal.m4
vendored
83
aclocal.m4
vendored
@@ -1,6 +1,6 @@
|
||||
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
|
||||
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -14,13 +14,13 @@
|
||||
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
|
||||
[m4_warning([this file was generated for autoconf 2.69.
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
|
||||
[m4_warning([this file was generated for autoconf 2.71.
|
||||
You have another version of autoconf. It may work, but is not guaranteed to.
|
||||
If you have problems, you may need to regenerate the build system entirely.
|
||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||
|
||||
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
|
||||
[am__api_version='1.16'
|
||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||
dnl require some minimum version. Point them to the right macro.
|
||||
m4_if([$1], [1.16.1], [],
|
||||
m4_if([$1], [1.16.5], [],
|
||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||
])
|
||||
|
||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||
[AM_AUTOMAKE_VERSION([1.16.1])dnl
|
||||
[AM_AUTOMAKE_VERSION([1.16.5])dnl
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
|
||||
# Figure out how to run the assembler. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
|
||||
|
||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
|
||||
# AM_CONDITIONAL -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
Usually this means the macro was only invoked conditionally.]])
|
||||
fi])])
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
||||
|
||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
done
|
||||
if test $am_rc -ne 0; then
|
||||
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
||||
for automatic dependency tracking. Try re-running configure with the
|
||||
for automatic dependency tracking. If GNU make was not used, consider
|
||||
re-running the configure script with MAKE="gmake" (or whatever is
|
||||
necessary). You can also try re-running configure with the
|
||||
'--disable-dependency-tracking' option to at least be able to build
|
||||
the package (albeit without support for automatic dependency tracking).])
|
||||
fi
|
||||
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
|
||||
# Do all the work for Automake. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
|
||||
# release and drop the old call support.
|
||||
AC_DEFUN([AM_INIT_AUTOMAKE],
|
||||
[AC_PREREQ([2.65])dnl
|
||||
m4_ifdef([_$0_ALREADY_INIT],
|
||||
[m4_fatal([$0 expanded multiple times
|
||||
]m4_defn([_$0_ALREADY_INIT]))],
|
||||
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
|
||||
dnl Autoconf wants to disallow AM_ names. We explicitly allow
|
||||
dnl the ones we care about.
|
||||
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
|
||||
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
|
||||
[_AM_SET_OPTIONS([$1])dnl
|
||||
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
|
||||
m4_if(
|
||||
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
|
||||
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
|
||||
[ok:ok],,
|
||||
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
|
||||
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
|
||||
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
|
||||
[m4_define([AC_PROG_OBJCXX],
|
||||
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
|
||||
])
|
||||
# Variables for tags utilities; see am/tags.am
|
||||
if test -z "$CTAGS"; then
|
||||
CTAGS=ctags
|
||||
fi
|
||||
AC_SUBST([CTAGS])
|
||||
if test -z "$ETAGS"; then
|
||||
ETAGS=etags
|
||||
fi
|
||||
AC_SUBST([ETAGS])
|
||||
if test -z "$CSCOPE"; then
|
||||
CSCOPE=cscope
|
||||
fi
|
||||
AC_SUBST([CSCOPE])
|
||||
|
||||
AC_REQUIRE([AM_SILENT_RULES])dnl
|
||||
dnl The testsuite driver may need to know about EXEEXT, so add the
|
||||
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
|
||||
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
|
||||
done
|
||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
|
||||
fi
|
||||
AC_SUBST([install_sh])])
|
||||
|
||||
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
|
||||
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
||||
# From Jim Meyering
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
||||
|
||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])
|
||||
|
||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
|
||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||
AC_REQUIRE_AUX_FILE([missing])dnl
|
||||
if test x"${MISSING+set}" != xset; then
|
||||
case $am_aux_dir in
|
||||
*\ * | *\ *)
|
||||
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
|
||||
*)
|
||||
MISSING="\${SHELL} $am_aux_dir/missing" ;;
|
||||
esac
|
||||
MISSING="\${SHELL} '$am_aux_dir/missing'"
|
||||
fi
|
||||
# Use eval to expand $SHELL
|
||||
if eval "$MISSING --is-lightweight"; then
|
||||
@@ -775,7 +790,7 @@ fi
|
||||
|
||||
# Helper functions for option handling. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
||||
AC_DEFUN([_AM_IF_OPTION],
|
||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
|
||||
# For backward compatibility.
|
||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
rm -f conftest.file
|
||||
])
|
||||
|
||||
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||
])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1039,7 +1054,7 @@ fi
|
||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||
|
||||
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
||||
|
||||
# Check how to create a tarball. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
|
@@ -67,7 +67,6 @@ void do_nothing () {}
|
||||
bool return_true () { return true; }
|
||||
bool return_false () { return false; }
|
||||
void *return_null () { return NULL; }
|
||||
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
|
||||
|
||||
void algo_not_tested()
|
||||
{
|
||||
@@ -95,7 +94,8 @@ int null_scanhash()
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Default generic scanhash can be used in many cases.
|
||||
// Default generic scanhash can be used in many cases. Not to be used when
|
||||
// prehashing can be done or when byte swapping the data can be avoided.
|
||||
int scanhash_generic( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -152,6 +152,9 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
// overwrite byte swapped nonce with original byte order for proper
|
||||
// incrementing. The nonce only needs to be byte swapped if it is to be
|
||||
// submitted.
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
@@ -260,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->build_block_header = (void*)&std_build_block_header;
|
||||
gate->build_extraheader = (void*)&std_build_extraheader;
|
||||
gate->set_work_data_endian = (void*)&do_nothing;
|
||||
gate->calc_network_diff = (void*)&std_calc_network_diff;
|
||||
gate->ready_to_mine = (void*)&std_ready_to_mine;
|
||||
gate->resync_threads = (void*)&do_nothing;
|
||||
gate->do_this_thread = (void*)&return_true;
|
||||
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
|
||||
@@ -305,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: rc = register_c11_algo ( gate ); break;
|
||||
case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
|
||||
@@ -324,6 +324,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
|
||||
@@ -423,7 +424,6 @@ const char* const algo_alias_map[][2] =
|
||||
{ "blake256r8", "blakecoin" },
|
||||
{ "blake256r8vnl", "vanilla" },
|
||||
{ "blake256r14", "blake" },
|
||||
{ "blake256r14dcr", "decred" },
|
||||
{ "diamond", "dmd-gr" },
|
||||
{ "espers", "hmq1725" },
|
||||
{ "flax", "c11" },
|
||||
|
@@ -144,7 +144,7 @@ void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
|
||||
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
|
||||
|
||||
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
|
||||
uint32_t*, uint32_t, uint32_t,
|
||||
uint32_t*, uint32_t, uint32_t,
|
||||
unsigned char* );
|
||||
|
||||
// Build mining.submit message
|
||||
@@ -155,19 +155,13 @@ char* ( *malloc_txs_request ) ( struct work* );
|
||||
// Big endian or little endian
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
|
||||
// Wait for first work
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
|
||||
// Diverge mining threads
|
||||
bool ( *do_this_thread ) ( int );
|
||||
|
||||
// After do_this_thread
|
||||
void ( *resync_threads ) ( int, struct work* );
|
||||
|
||||
// No longer needed
|
||||
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
|
||||
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
|
||||
|
||||
set_t optimizations;
|
||||
int ( *get_work_data_size ) ();
|
||||
@@ -286,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
|
||||
// Default is do_nothing, little endian is assumed
|
||||
void set_work_data_big_endian( struct work *work );
|
||||
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
||||
void std_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits,
|
||||
@@ -297,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
|
||||
|
||||
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
|
||||
|
||||
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id );
|
||||
|
||||
int std_get_work_data_size();
|
||||
|
||||
// Gate admin functions
|
||||
|
@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
|
||||
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
const void *data );
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
|
||||
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
const void *data );
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,74 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined (DECRED_4WAY)
|
||||
|
||||
static __thread blake256_4way_context blake_mid;
|
||||
|
||||
void decred_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way_update( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) edata[48];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// copy to buffer guaranteed to be aligned.
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
* noncep = n;
|
||||
*(noncep+1) = n+1;
|
||||
*(noncep+2) = n+2;
|
||||
*(noncep+3) = n+3;
|
||||
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,171 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
uint32_t *decred_get_nonceptr( uint32_t *work_data )
|
||||
{
|
||||
return &work_data[ DECRED_NONCE_INDEX ];
|
||||
}
|
||||
|
||||
long double decred_calc_network_diff( struct work* work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
int m;
|
||||
long double d = (long double)0x0000ffff / (long double)bits;
|
||||
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
for ( m = 29; m < shift; m++ )
|
||||
d /= 256.0;
|
||||
if ( shift == 28 )
|
||||
d *= 256.0; // testnet
|
||||
if ( opt_debug_diff )
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
|
||||
shift, bits );
|
||||
return net_diff;
|
||||
}
|
||||
|
||||
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
{
|
||||
// some random extradata to make the work unique
|
||||
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
|
||||
work->height = work->data[32];
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if ( net_diff > 0. )
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
work->targetdiff);
|
||||
else
|
||||
sprintf(netinfo, ", diff %.3f", net_diff);
|
||||
}
|
||||
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
|
||||
netinfo);
|
||||
*net_blocks = work->height - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void decred_be_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
|
||||
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
|
||||
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
|
||||
sctx->xnonce1_size );
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free(xnonce2str);
|
||||
}
|
||||
|
||||
#if !defined(min)
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
#endif
|
||||
|
||||
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
|
||||
int i;
|
||||
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = swab32(
|
||||
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
|
||||
|
||||
// for ( i = 0; i < 8; i++ ) // prevhash
|
||||
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
|
||||
// for ( i = 0; i < 8; i++ ) // merkle
|
||||
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
|
||||
|
||||
for ( i = 0; i < headersize/4; i++ ) // header
|
||||
g_work->data[17 + i] = extraheader[i];
|
||||
// extradata
|
||||
|
||||
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
|
||||
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
|
||||
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
|
||||
g_work->data[i] = 0;
|
||||
g_work->data[37] = (rand()*4) << 8;
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->block_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
if ( have_stratum && !work->data[0] && !opt_benchmark )
|
||||
{
|
||||
sleep(1);
|
||||
return false;
|
||||
}
|
||||
// extradata: prevent duplicates
|
||||
work->data[ DECRED_XNONCE_INDEX ] += 1;
|
||||
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Size in bytes of the work data used by decred.
int decred_get_work_data_size()
{
   const int size = DECRED_DATA_SIZE;
   return size;
}
|
||||
|
||||
// Install the decred entry points and header layout into the algo gate.
// Also clears the allow_mininginfo and have_gbt globals. Always returns true.
bool register_decred_algo( algo_gate_t* gate )
{
   // Hashing: the 4 way implementation when it is compiled in, otherwise
   // the scalar one.
#if defined(DECRED_4WAY)
   four_way_not_tested();
   gate->scanhash = (void*)&scanhash_decred_4way;
   gate->hash     = (void*)&decred_hash_4way;
#else
   gate->scanhash = (void*)&scanhash_decred;
   gate->hash     = (void*)&decred_hash;
#endif
   gate->optimizations = AVX2_OPT;

   // Work handling hooks.
   gate->decode_extra_data     = (void*)&decred_decode_extradata;
   gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
   gate->work_decode           = (void*)&std_be_work_decode;
   gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
   gate->build_extraheader     = (void*)&decred_build_extraheader;
   gate->ready_to_mine         = (void*)&decred_ready_to_mine;
   gate->get_work_data_size    = (void*)&decred_get_work_data_size;

   // Word offsets of the header fields and the compare length.
   gate->nbits_index   = DECRED_NBITS_INDEX;
   gate->ntime_index   = DECRED_NTIME_INDEX;
   gate->nonce_index   = DECRED_NONCE_INDEX;
   gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;

   allow_mininginfo = false;
   have_gbt = false;
   return true;
}
|
||||
|
@@ -1,36 +0,0 @@
|
||||
#ifndef __DECRED_GATE_H__
|
||||
#define __DECRED_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define DECRED_NBITS_INDEX 29
|
||||
#define DECRED_NTIME_INDEX 34
|
||||
#define DECRED_NONCE_INDEX 35
|
||||
#define DECRED_XNONCE_INDEX 36
|
||||
#define DECRED_DATA_SIZE 192
|
||||
#define DECRED_WORK_COMPARE_SIZE 140
|
||||
#define DECRED_MIDSTATE_LEN 128
|
||||
|
||||
#if defined (__AVX2__)
|
||||
//void blakehash_84way(void *state, const void *input);
|
||||
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done );
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#define DECRED_4WAY
|
||||
#endif
|
||||
|
||||
#if defined (DECRED_4WAY)
|
||||
void decred_hash_4way(void *state, const void *input);
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void decred_hash( void *state, const void *input );
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
@@ -1,282 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
|
||||
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
|
||||
|
||||
#include "sph_blake.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
|
||||
#ifndef min
|
||||
#define min(a,b) (a>b ? b : a)
|
||||
#endif
|
||||
#ifndef max
|
||||
#define max(a,b) (a<b ? b : a)
|
||||
#endif
|
||||
*/
|
||||
/*
|
||||
#define DECRED_NBITS_INDEX 29
|
||||
#define DECRED_NTIME_INDEX 34
|
||||
#define DECRED_NONCE_INDEX 35
|
||||
#define DECRED_XNONCE_INDEX 36
|
||||
#define DECRED_DATA_SIZE 192
|
||||
#define DECRED_WORK_COMPARE_SIZE 140
|
||||
*/
|
||||
static __thread sph_blake256_context blake_mid;
|
||||
static __thread bool ctx_midstate_done = false;
|
||||
|
||||
void decred_hash(void *state, const void *input)
|
||||
{
|
||||
// #define MIDSTATE_LEN 128
|
||||
sph_blake256_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
uint8_t *ending = (uint8_t*) input;
|
||||
ending += DECRED_MIDSTATE_LEN;
|
||||
|
||||
if (!ctx_midstate_done) {
|
||||
sph_blake256_init(&blake_mid);
|
||||
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
|
||||
ctx_midstate_done = true;
|
||||
}
|
||||
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
|
||||
|
||||
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
|
||||
sph_blake256_close(&ctx, state);
|
||||
}
|
||||
|
||||
void decred_hash_simple(void *state, const void *input)
|
||||
{
|
||||
sph_blake256_context ctx;
|
||||
sph_blake256_init(&ctx);
|
||||
sph_blake256(&ctx, input, 180);
|
||||
sph_blake256_close(&ctx, state);
|
||||
}
|
||||
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
#if 1
|
||||
memcpy(endiandata, pdata, 180);
|
||||
#else
|
||||
for (int k=0; k < (180/4); k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
#endif
|
||||
|
||||
do {
|
||||
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
|
||||
endiandata[DECRED_NONCE_INDEX] = n;
|
||||
decred_hash(hash32, endiandata);
|
||||
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
uint32_t *decred_get_nonceptr( uint32_t *work_data )
|
||||
{
|
||||
return &work_data[ DECRED_NONCE_INDEX ];
|
||||
}
|
||||
|
||||
double decred_calc_network_diff( struct work* work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
int m;
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
for ( m = 29; m < shift; m++ )
|
||||
d /= 256.0;
|
||||
if ( shift == 28 )
|
||||
d *= 256.0; // testnet
|
||||
if ( opt_debug_diff )
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
|
||||
shift, bits );
|
||||
return net_diff;
|
||||
}
|
||||
|
||||
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
{
|
||||
// some random extradata to make the work unique
|
||||
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
|
||||
work->height = work->data[32];
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (net_diff > 0.)
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
work->targetdiff);
|
||||
else
|
||||
sprintf(netinfo, ", diff %.3f", net_diff);
|
||||
}
|
||||
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
|
||||
netinfo);
|
||||
*net_blocks = work->height - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void decred_be_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
|
||||
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
|
||||
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
|
||||
sctx->xnonce1_size );
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free(xnonce2str);
|
||||
}
|
||||
*/
|
||||
/*
|
||||
// data shared between gen_merkle_root and build_extraheader.
|
||||
__thread uint32_t decred_extraheader[32] = { 0 };
|
||||
__thread int decred_headersize = 0;
|
||||
|
||||
void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
decred_headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(decred_extraheader) );
|
||||
memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
|
||||
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
|
||||
size_t t;
|
||||
int i;
|
||||
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = swab32(
|
||||
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
|
||||
|
||||
// for ( i = 0; i < 8; i++ ) // prevhash
|
||||
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
|
||||
// for ( i = 0; i < 8; i++ ) // merkle
|
||||
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
|
||||
|
||||
for ( i = 0; i < headersize/4; i++ ) // header
|
||||
g_work->data[17 + i] = extraheader[i];
|
||||
// extradata
|
||||
|
||||
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
|
||||
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
|
||||
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
|
||||
g_work->data[i] = 0;
|
||||
g_work->data[37] = (rand()*4) << 8;
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->bloc_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
if ( have_stratum && !work->data[0] && !opt_benchmark )
|
||||
{
|
||||
sleep(1);
|
||||
return false;
|
||||
}
|
||||
// extradata: prevent duplicates
|
||||
work->data[ DECRED_XNONCE_INDEX ] += 1;
|
||||
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool register_decred_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->build_extraheader = (void*)&decred_build_extraheader;
|
||||
gate->ready_to_mine = (void*)&decred_ready_to_mine;
|
||||
gate->nbits_index = DECRED_NBITS_INDEX;
|
||||
gate->ntime_index = DECRED_NTIME_INDEX;
|
||||
gate->nonce_index = DECRED_NONCE_INDEX;
|
||||
gate->work_data_size = DECRED_DATA_SIZE;
|
||||
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
|
||||
allow_mininginfo = false;
|
||||
have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
@@ -1,6 +1,6 @@
|
||||
#include "pentablake-gate.h"
|
||||
|
||||
#if defined (__AVX2__)
|
||||
#if defined(PENTABLAKE_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
@@ -4,9 +4,10 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define PENTABLAKE_4WAY
|
||||
#endif
|
||||
// 4way is broken
|
||||
//#if defined(__AVX2__)
|
||||
// #define PENTABLAKE_4WAY
|
||||
//#endif
|
||||
|
||||
#if defined(PENTABLAKE_4WAY)
|
||||
void pentablakehash_4way( void *state, const void *input );
|
||||
|
@@ -78,7 +78,8 @@
|
||||
V[1] = mm256_shufll_64( V[1] ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSSE3__)
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
@@ -102,19 +103,20 @@
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_shufl2r_64( V[2], V[3] ); \
|
||||
V3 = mm128_shufl2r_64( V[3], V[2] ); \
|
||||
V6 = mm128_shufl2l_64( V[6], V[7] ); \
|
||||
V7 = mm128_shufl2l_64( V[7], V[6] ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_shufl2l_64( V2, V3 ); \
|
||||
V[3] = mm128_shufl2l_64( V3, V2 ); \
|
||||
V[6] = mm128_shufl2r_64( V6, V7 ); \
|
||||
V[7] = mm128_shufl2r_64( V7, V6 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
|
@@ -747,38 +747,40 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mj[14] = mm256_rol_64( M[14], 15 );
|
||||
mj[15] = mm256_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
|
||||
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
|
||||
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||
@@ -1180,7 +1182,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
|
||||
__m512i mj[16];
|
||||
uint64_t K = 16 * 0x0555555555555555ULL;
|
||||
|
||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||
@@ -1199,54 +1200,40 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
mj[14] = mm512_rol_64( M[14], 15 );
|
||||
mj[15] = mm512_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
|
||||
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
|
@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
|
@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
|
||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||
|
||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
// INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
|
@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
// return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
// return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m256_zero;
|
||||
|
@@ -21,9 +21,6 @@
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
|
||||
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
|
||||
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
|
||||
|
@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], nonce);
|
||||
myriad_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget))
|
||||
if (hash[7] <= Htarg )
|
||||
if ( fulltest(hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
|
||||
|
@@ -585,9 +585,8 @@ do { \
|
||||
t = _mm512_xor_si512( t, c ); \
|
||||
d = mm512_xoror( a, b, t ); \
|
||||
t = mm512_xorand( t, a, b ); \
|
||||
b = mm512_xor3( b, d, t ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
c = mm512_xor3( b, d, t ); \
|
||||
b = d; \
|
||||
d = mm512_not( t ); \
|
||||
} while (0)
|
||||
@@ -635,7 +634,7 @@ do { \
|
||||
|
||||
#define ROUND_BIG8( alpha ) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
__m512i t0, t1, t2, t3, t4, t5; \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
|
||||
@@ -662,43 +661,35 @@ do { \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
||||
L8( s0, t1, s9, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
||||
L8( s0, t0, s9, t1 ); \
|
||||
\
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
|
||||
L8( s1, t1, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
|
||||
L8( s1, t2, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
|
||||
\
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
||||
L8( s2, t1, sB, t3 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
|
||||
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
||||
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
||||
L8( s2, t4, sB, t5 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
|
||||
L8( s3, t1, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
|
||||
L8( s3, t2, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
\
|
||||
@@ -924,10 +915,9 @@ do { \
|
||||
d = _mm256_xor_si256( d, a ); \
|
||||
a = _mm256_and_si256( a, b ); \
|
||||
t = _mm256_xor_si256( t, a ); \
|
||||
b = _mm256_xor_si256( b, d ); \
|
||||
b = _mm256_xor_si256( b, t ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
c = _mm256_xor_si256( b, d ); \
|
||||
c = _mm256_xor_si256( c, t ); \
|
||||
b = d; \
|
||||
d = mm256_not( t ); \
|
||||
} while (0)
|
||||
@@ -977,7 +967,7 @@ do { \
|
||||
|
||||
#define ROUND_BIG( alpha ) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
__m256i t0, t1, t2, t3, t4, t5; \
|
||||
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
|
||||
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
|
||||
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
|
||||
@@ -1004,43 +994,35 @@ do { \
|
||||
s5 = mm256_swap64_32( s5 ); \
|
||||
sD = mm256_swap64_32( sD ); \
|
||||
sE = mm256_swap64_32( sE ); \
|
||||
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
|
||||
L( s0, t1, s9, t3 ); \
|
||||
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
|
||||
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
|
||||
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
|
||||
L( s0, t0, s9, t1 ); \
|
||||
\
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
|
||||
L( s1, t1, sA, t3 ); \
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
|
||||
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
|
||||
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
|
||||
L( s1, t2, sA, t3 ); \
|
||||
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
|
||||
\
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
|
||||
L( s2, t1, sB, t3 ); \
|
||||
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
|
||||
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
|
||||
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
|
||||
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
|
||||
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
|
||||
L( s2, t4, sB, t5 ); \
|
||||
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
|
||||
L( s3, t1, s8, t3 ); \
|
||||
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
|
||||
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
|
||||
L( s3, t2, s8, t3 ); \
|
||||
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
|
||||
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
\
|
||||
|
@@ -141,6 +141,13 @@ do { \
|
||||
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
|
||||
mm128_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* PASSy(n, in) computes pass number "y", for a total of "n", using the
|
||||
* one-argument macro "in" to access input words. Current state is assumed
|
||||
@@ -152,22 +159,22 @@ do { \
|
||||
#define PASS1(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7), SPH_C32(0x00000000)); \
|
||||
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -605,25 +612,32 @@ do { \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
#define PASS1_8W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7), SPH_C32(0x00000000)); \
|
||||
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
@@ -49,12 +49,11 @@ extern "C"{
|
||||
|
||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m512i cc = _mm512_set1_epi64( c ); \
|
||||
x3 = mm512_not( x3 ); \
|
||||
const __m512i cc = _mm512_set1_epi64( c ); \
|
||||
x0 = mm512_xorandnot( x0, x2, cc ); \
|
||||
tmp = mm512_xorand( cc, x0, x1 ); \
|
||||
x0 = mm512_xorand( x0, x2, x3 ); \
|
||||
x3 = mm512_xorandnot( x3, x1, x2 ); \
|
||||
x0 = mm512_xorandnot( x0, x3, x2 ); \
|
||||
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
|
||||
x1 = mm512_xorand( x1, x0, x2 ); \
|
||||
x2 = mm512_xorandnot( x2, x3, x0 ); \
|
||||
x0 = mm512_xoror( x0, x1, x3 ); \
|
||||
@@ -79,7 +78,7 @@ do { \
|
||||
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m256i cc = _mm256_set1_epi64x( c ); \
|
||||
const __m256i cc = _mm256_set1_epi64x( c ); \
|
||||
x3 = mm256_not( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
|
||||
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
|
||||
|
@@ -72,11 +72,11 @@ static const uint64_t RC[] = {
|
||||
// Targetted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR64 XOR
|
||||
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define NOT64(d, s) (d = mm512_not( s ) )
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
@@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) __m256i x
|
||||
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define DECL64(x) __m256i x
|
||||
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = mm256_not( s ) )
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
|
||||
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
|
||||
|
||||
|
@@ -62,186 +62,66 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
|
||||
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT4W(a,b,c0,c1)\
|
||||
a = _mm512_xor_si512(a,c0);\
|
||||
b = _mm512_xor_si512(b,c1);
|
||||
#define ADD_CONSTANT4W( a, b, c0, c1 ) \
|
||||
a = _mm512_xor_si512( a, c0 ); \
|
||||
b = _mm512_xor_si512( b, c1 );
|
||||
|
||||
#define MULT24W( a0, a1 ) \
|
||||
do { \
|
||||
{ \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
|
||||
_mm512_bslli_epi128( a1,12 ) ); \
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
|
||||
_mm512_bslli_epi128( b,12 ) ); \
|
||||
} while(0)
|
||||
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define MULT24W( a0, a1, mask ) \
|
||||
do { \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART4W(x,c0,c1,t)\
|
||||
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = a0;\
|
||||
#define SUBCRUMB4W( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m512i t = a0; \
|
||||
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a2 = _mm512_xor_si512( a2, a3 ); \
|
||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm512_xorand( a2, a3, t ); \
|
||||
a2 = mm512_xorand( a1, a2, a0);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = mm512_xnor(a1,a0);\
|
||||
a0 = t;
|
||||
a2 = mm512_xorand( a1, a2, a0); \
|
||||
a1 = _mm512_or_si512( a1, a3 ); \
|
||||
a3 = _mm512_xor_si512( a3, a2 ); \
|
||||
t = _mm512_xor_si512( t, a1 ); \
|
||||
a2 = _mm512_and_si512( a2, a1 ); \
|
||||
a1 = mm512_xnor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
/*
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = _mm512_load_si512(&a0);\
|
||||
a0 = _mm512_or_si512(a0,a1);\
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
|
||||
a0 = _mm512_xor_si512(a0,a3);\
|
||||
a3 = _mm512_and_si512(a3,t);\
|
||||
a1 = _mm512_xor_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a0);\
|
||||
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
|
||||
a2 = _mm512_xor_si512(a2,a1);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = _mm512_xor_si512(a1,a0);\
|
||||
a0 = _mm512_load_si512(&t);
|
||||
*/
|
||||
#define MIXWORD4W( a, b ) \
|
||||
b = _mm512_xor_si512( a, b ); \
|
||||
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 2 ) ); \
|
||||
b = _mm512_xor_si512( a, _mm512_rol_epi32( b, 14 ) ); \
|
||||
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 10 ) ); \
|
||||
b = _mm512_rol_epi32( b, 1 );
|
||||
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = mm512_xoror( a, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
#define STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB4W( x0, x1, x2, x3 ); \
|
||||
SUBCRUMB4W( x5, x6, x7, x4 ); \
|
||||
MIXWORD4W( x0, x4 ); \
|
||||
MIXWORD4W( x1, x5 ); \
|
||||
MIXWORD4W( x2, x6 ); \
|
||||
MIXWORD4W( x3, x7 ); \
|
||||
ADD_CONSTANT4W( x0, x4, c0, c1 );
|
||||
|
||||
/*
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
*/
|
||||
|
||||
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm512_shuffle_epi32(a1,147);\
|
||||
t0 = _mm512_load_si512(&a1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm512_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm512_shuffle_epi32(t0,78);\
|
||||
a0 = _mm512_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm512_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm512_load_si512(&a1);\
|
||||
a0 = _mm512_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm512_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm512_shuffle_epi32(a1,57);\
|
||||
MIXWORD4W(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT4W(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm512_load_si512(&r1);\
|
||||
q2 = _mm512_load_si512(&p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,216);\
|
||||
p2 = _mm512_shuffle_epi32(p2,216);\
|
||||
r1 = _mm512_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm512_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm512_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm512_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm512_load_si512(&r2);\
|
||||
q0 = _mm512_load_si512(&p2);\
|
||||
r2 = _mm512_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm512_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm512_load_si512(&s0);\
|
||||
q1 = _mm512_load_si512(&q0);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,225);\
|
||||
p2 = _mm512_shuffle_epi32(p2,225);\
|
||||
r0 = _mm512_load_si512(&s1);\
|
||||
p0 = _mm512_load_si512(&q1);\
|
||||
s0 = _mm512_shuffle_epi32(s0,225);\
|
||||
q0 = _mm512_shuffle_epi32(q0,225);\
|
||||
s1 = _mm512_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm512_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s3 = _mm512_load_si512(&r2);\
|
||||
q3 = _mm512_load_si512(&p2);
|
||||
|
||||
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm512_load_si512(&r0);\
|
||||
q0 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r2);\
|
||||
q1 = _mm512_load_si512(&p2);\
|
||||
r0 = _mm512_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm512_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm512_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm512_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm512_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm512_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm512_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm512_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm512_load_si512(&r0);\
|
||||
p1 = _mm512_load_si512(&p0);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm512_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm512_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r1);\
|
||||
q1 = _mm512_load_si512(&p1);
|
||||
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm512_shuffle_epi32( a1, 147 ); \
|
||||
t0 = _mm512_load_si512( &a1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm512_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm512_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB4W( t1, t0, a0, a1 ); \
|
||||
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm512_load_si512( &a1 ); \
|
||||
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
|
||||
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm512_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD4W( a0, a1 ); \
|
||||
ADD_CONSTANT4W( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm512_load_si512(&r3);\
|
||||
@@ -279,8 +159,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
__m512i t0, t1;
|
||||
__m512i *chainv = state->chainv;
|
||||
__m512i msg0, msg1;
|
||||
__m512i tmp[2];
|
||||
__m512i x[8];
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
@@ -372,42 +251,30 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
|
||||
|
||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
|
||||
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
|
||||
|
||||
STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 0), cns4w( 1) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 2), cns4w( 3) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 4), cns4w( 5) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 6), cns4w( 7) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 8), cns4w( 9) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(10), cns4w(11) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(12), cns4w(13) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(14), cns4w(15) );
|
||||
|
||||
MIXTON10244W( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
MIXTON10244W( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7] );
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31) );
|
||||
}
|
||||
|
||||
void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
@@ -683,10 +550,11 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
||||
a = _mm256_xor_si256(a,c0);\
|
||||
b = _mm256_xor_si256(b,c1);
|
||||
#define ADD_CONSTANT( a, b, c0, c1 ) \
|
||||
a = _mm256_xor_si256( a, c0 ); \
|
||||
b = _mm256_xor_si256( b, c1 );
|
||||
|
||||
/*
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
@@ -694,127 +562,83 @@ do { \
|
||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
{ \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
|
||||
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = a0;\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = mm256_not( a1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = mm256_not( a0 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = t;\
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m256i t = a0; \
|
||||
a0 = _mm256_or_si256( a0, a1 ); \
|
||||
a2 = _mm256_xor_si256( a2, a3 ); \
|
||||
a1 = mm256_not( a1 ); \
|
||||
a0 = _mm256_xor_si256( a0, a3 ); \
|
||||
a3 = _mm256_and_si256( a3, t ); \
|
||||
a1 = _mm256_xor_si256( a1, a3 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
a2 = _mm256_and_si256( a2, a0 ); \
|
||||
a0 = mm256_not( a0 ); \
|
||||
a2 = _mm256_xor_si256( a2, a1 ); \
|
||||
a1 = _mm256_or_si256( a1, a3 ); \
|
||||
t = _mm256_xor_si256( t, a1 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
a2 = _mm256_and_si256( a2, a1 ); \
|
||||
a1 = _mm256_xor_si256( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,2);\
|
||||
t2 = _mm256_srli_epi32(a,30);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,14);\
|
||||
t2 = _mm256_srli_epi32(b,18);\
|
||||
b = _mm256_or_si256(t1,t2);\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,10);\
|
||||
t2 = _mm256_srli_epi32(a,22);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,1);\
|
||||
t2 = _mm256_srli_epi32(b,31);\
|
||||
b = _mm256_or_si256(t1,t2);
|
||||
#define MIXWORD( a, b ) \
|
||||
{ \
|
||||
__m256i t1, t2; \
|
||||
b = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( a, 2 ); \
|
||||
t2 = _mm256_srli_epi32( a, 30 ); \
|
||||
a = _mm256_or_si256( t1, t2 ); \
|
||||
a = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( b, 14 ); \
|
||||
t2 = _mm256_srli_epi32( b, 18 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
b = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( a, 10 ); \
|
||||
t2 = _mm256_srli_epi32( a, 22 ); \
|
||||
a = _mm256_or_si256( t1,t2 ); \
|
||||
a = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( b,1 ); \
|
||||
t2 = _mm256_srli_epi32( b,31 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
}
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
||||
t0 = _mm256_load_si256(&a1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm256_load_si256(&a1);\
|
||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB( x0, x1, x2, x3 ); \
|
||||
SUBCRUMB( x5, x6, x7, x4 ); \
|
||||
MIXWORD( x0, x4 ); \
|
||||
MIXWORD( x1, x5 ); \
|
||||
MIXWORD( x2, x6 ); \
|
||||
MIXWORD( x3, x7 ); \
|
||||
ADD_CONSTANT( x0, x4, c0, c1 );
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm256_load_si256(&r1);\
|
||||
q2 = _mm256_load_si256(&p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm256_load_si256(&r2);\
|
||||
q0 = _mm256_load_si256(&p2);\
|
||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm256_load_si256(&s0);\
|
||||
q1 = _mm256_load_si256(&q0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
||||
r0 = _mm256_load_si256(&s1);\
|
||||
p0 = _mm256_load_si256(&q1);\
|
||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
q0 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r2);\
|
||||
q1 = _mm256_load_si256(&p2);\
|
||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm256_load_si256(&r0);\
|
||||
p1 = _mm256_load_si256(&p0);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r1);\
|
||||
q1 = _mm256_load_si256(&p1);\
|
||||
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm256_shuffle_epi32( a1, 147); \
|
||||
t0 = _mm256_load_si256( &a1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm256_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm256_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB( t1, t0, a0, a1 );\
|
||||
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm256_load_si256( &a1 ); \
|
||||
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
|
||||
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm256_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD( a0, a1 ); \
|
||||
ADD_CONSTANT( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
@@ -857,9 +681,8 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
__m256i t0, t1;
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i tmp[2];
|
||||
__m256i x[8];
|
||||
const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
const __m256i MASK = m256_const1_i128( 0xffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
@@ -958,42 +781,30 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
chainv[7] = mm256_rol_32( chainv[7], 3 );
|
||||
chainv[9] = mm256_rol_32( chainv[9], 4 );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
|
||||
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
|
||||
|
||||
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
|
||||
|
||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
|
||||
}
|
||||
|
||||
/***************************************************/
|
||||
|
@@ -19,29 +19,37 @@
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi32( a1, b, 1 ); \
|
||||
a1 = _mm_alignr_epi32( b, a1, 1 ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
static inline __m256i mult2_avx2( a )
|
||||
{
|
||||
__m128 a0, a0, b;
|
||||
a0 = mm128_extractlo_256( a );
|
||||
a1 = mm128_extracthi_256( a );
|
||||
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
|
||||
return mm256_concat_128( a1, a0 );
|
||||
}
|
||||
*/
|
||||
#else
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
#define STEP_PART(x,c,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
@@ -73,13 +81,13 @@ static inline __m256i mult2_avx2( a )
|
||||
t = _mm_load_si128(&a0);\
|
||||
a0 = _mm_or_si128(a0,a1);\
|
||||
a2 = _mm_xor_si128(a2,a3);\
|
||||
a1 = _mm_andnot_si128(a1,ALLONE);\
|
||||
a1 = mm128_not( a1 );\
|
||||
a0 = _mm_xor_si128(a0,a3);\
|
||||
a3 = _mm_and_si128(a3,t);\
|
||||
a1 = _mm_xor_si128(a1,a3);\
|
||||
a3 = _mm_xor_si128(a3,a2);\
|
||||
a2 = _mm_and_si128(a2,a0);\
|
||||
a0 = _mm_andnot_si128(a0,ALLONE);\
|
||||
a0 = mm128_not( a0 );\
|
||||
a2 = _mm_xor_si128(a2,a1);\
|
||||
a1 = _mm_or_si128(a1,a3);\
|
||||
t = _mm_xor_si128(t,a1);\
|
||||
@@ -255,17 +263,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
|
||||
|
||||
|
||||
__m128i CNS128[32];
|
||||
__m128i ALLONE;
|
||||
#if !defined(__SSE4_1__)
|
||||
__m128i MASK;
|
||||
#endif
|
||||
|
||||
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
|
||||
{
|
||||
int i;
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
/* set all bits to '1' */
|
||||
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
|
||||
@@ -365,10 +374,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
|
||||
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
|
||||
int i;
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
/* set all bits to '1' */
|
||||
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
|
||||
|
@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces, add padding.
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||
block_buf[ 4] = m512_const1_32( 0x80000000 );
|
||||
block_buf[ 5] =
|
||||
block_buf[ 6] =
|
||||
block_buf[ 7] =
|
||||
block_buf[ 8] =
|
||||
block_buf[ 9] =
|
||||
block_buf[10] =
|
||||
block_buf[11] =
|
||||
block_buf[12] = m512_zero;
|
||||
block_buf[13] = m512_one_32;
|
||||
block_buf[14] = m512_zero;
|
||||
block_buf[15] = m512_const1_32( 80*8 );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces and add padding.
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||
block_buf[ 4] = m256_const1_32( 0x80000000 );
|
||||
block_buf[ 5] =
|
||||
block_buf[ 6] =
|
||||
block_buf[ 7] =
|
||||
block_buf[ 8] =
|
||||
block_buf[ 9] =
|
||||
block_buf[10] =
|
||||
block_buf[11] =
|
||||
block_buf[12] = m256_zero;
|
||||
block_buf[13] = m256_one_32;
|
||||
block_buf[14] = m256_zero;
|
||||
block_buf[15] = m256_const1_32( 80*8 );
|
||||
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
|
||||
n+ 3, n+ 2, n+ 1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces and add padding.
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
block_buf[ 4] = m512_const1_32( 0x80000000 );
|
||||
block_buf[ 5] =
|
||||
block_buf[ 6] =
|
||||
block_buf[ 7] =
|
||||
block_buf[ 8] =
|
||||
block_buf[ 9] =
|
||||
block_buf[10] =
|
||||
block_buf[11] =
|
||||
block_buf[12] = m512_zero;
|
||||
block_buf[13] = m512_one_32;
|
||||
block_buf[14] = m512_zero;
|
||||
block_buf[15] = m512_const1_32( 80*8 );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces and add padding.
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
block_buf[ 4] = m256_const1_32( 0x80000000 );
|
||||
block_buf[ 5] =
|
||||
block_buf[ 6] =
|
||||
block_buf[ 7] =
|
||||
block_buf[ 8] =
|
||||
block_buf[ 9] =
|
||||
block_buf[10] =
|
||||
block_buf[11] =
|
||||
block_buf[12] = m256_zero;
|
||||
block_buf[13] = m256_one_32;
|
||||
block_buf[14] = m256_zero;
|
||||
block_buf[15] = m256_const1_32( 80*8 );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include "lyra2.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
__thread uint64_t* lyra2z330_wholeMatrix;
|
||||
static __thread uint64_t* lyra2z330_wholeMatrix;
|
||||
|
||||
void lyra2z330_hash(void *state, const void *input, uint32_t height)
|
||||
{
|
||||
|
@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
{ \
|
||||
__m128i t; \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_vrol256_64( s6, s7 ); \
|
||||
mm128_vror256_64( s2, s3 ); \
|
||||
t = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s6 = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s2 = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s3 = t; \
|
||||
G_2X64( s0, s2, s5, s6 ); \
|
||||
G_2X64( s1, s3, s4, s7 ); \
|
||||
mm128_vror256_64( s6, s7 ); \
|
||||
mm128_vrol256_64( s2, s3 );
|
||||
t = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s6 = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s2 = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s3 = t; \
|
||||
}
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
@@ -15,7 +15,8 @@
|
||||
|
||||
#if defined (ANIME_8WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
@@ -26,23 +27,9 @@ typedef struct {
|
||||
jh512_8way_context jh;
|
||||
skein512_8way_context skein;
|
||||
keccak512_8way_context keccak;
|
||||
} anime_8way_ctx_holder;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_anime_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &anime_8way_ctx.blake );
|
||||
bmw512_8way_init( &anime_8way_ctx.bmw );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
|
||||
#else
|
||||
init_groestl( &anime_8way_ctx.groestl, 64 );
|
||||
#endif
|
||||
skein512_8way_init( &anime_8way_ctx.skein );
|
||||
jh512_8way_init( &anime_8way_ctx.jh );
|
||||
keccak512_8way_init( &anime_8way_ctx.keccak );
|
||||
}
|
||||
typedef union _anime_8way_context_overlay anime_8way_context_overlay;
|
||||
|
||||
void anime_8way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__m512i* vhC = (__m512i*)vhashC;
|
||||
const __m512i bit3_mask = m512_const1_64( 8 );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
__mmask8 vh_mask;
|
||||
anime_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
|
||||
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (ANIME_4WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_4way_context jh;
|
||||
skein512_4way_context skein;
|
||||
keccak512_4way_context keccak;
|
||||
} anime_4way_ctx_holder;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _anime_4way_context_overlay anime_4way_context_overlay;
|
||||
|
||||
void anime_4way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
|
||||
int h_mask;
|
||||
const __m256i bit3_mask = m256_const1_64( 8 );
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
anime_4way_ctx_holder ctx;
|
||||
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, input, 80 );
|
||||
@@ -293,7 +281,18 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
@@ -13,6 +13,7 @@
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
|
||||
{
|
||||
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
// 4x32 for haval
|
||||
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( hash0[0] & mask )
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
|
||||
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_8way_close( &ctx.sha512, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
|
||||
cube_2way_context cube2;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd sd;
|
||||
shavite512_2way_context shavite2;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
@@ -633,6 +627,10 @@ union _hmq1725_4way_context_overlay
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
haval256_5_4way_context haval;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
echo_2way_context echo2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
|
||||
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
|
||||
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
|
||||
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
|
||||
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
|
||||
|
||||
intrlv_2x128_512( vhashA, hash0, hash1 );
|
||||
intrlv_2x128_512( vhashB, hash2, hash3 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
|
||||
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
@@ -807,7 +811,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
(const BitSequence *)hash3, 64 );
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
sha512_4way_init( &ctx.sha512 );
|
||||
sha512_4way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_4way_close( &ctx.sha512, vhash );
|
||||
|
@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -154,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -169,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
|
@@ -4,24 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
long double lbry_calc_network_diff( struct work *work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
|
||||
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
long double d = (long double)0x0000ffff / (long double)bits;
|
||||
|
||||
for (int m=shift; m < 29; m++) d *= 256.0;
|
||||
for (int m=29; m < shift; m++) d /= 256.0;
|
||||
if (opt_debug_diff)
|
||||
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
// std_le should work but it doesn't
|
||||
void lbry_le_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
|
||||
free(xnonce2str);
|
||||
}
|
||||
|
||||
/*
|
||||
void lbry_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits )
|
||||
{
|
||||
int i;
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = version;
|
||||
|
||||
if ( have_stratum )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = le32dec( prevhash + i );
|
||||
else
|
||||
for (i = 0; i < 8; i++)
|
||||
g_work->data[ 8-i ] = le32dec( prevhash + i );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = be32dec( merkle_root + i );
|
||||
|
||||
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
|
||||
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
|
||||
g_work->data[28] = 0x80000000;
|
||||
}
|
||||
*/
|
||||
|
||||
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
unsigned char merkle_root[64] = { 0 };
|
||||
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&lbry_hash;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
#endif
|
||||
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
|
||||
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
||||
// gate->build_block_header = (void*)&build_block_header;
|
||||
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
||||
gate->ntime_index = LBRY_NTIME_INDEX;
|
||||
gate->nbits_index = LBRY_NBITS_INDEX;
|
||||
|
@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
|
||||
}
|
||||
}
|
||||
|
||||
// Working, not up to date, needs stream optimization.
|
||||
// Working, not up to date, needs stream, shuffle optimizations.
|
||||
// 4x32 interleaving
|
||||
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
|
||||
{
|
||||
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
|
||||
// 4x memory usage
|
||||
// Working
|
||||
// 4x128 interleaving
|
||||
static void salsa_shuffle_4way_simd128( __m512i *X )
|
||||
static inline void salsa_shuffle_4way_simd128( __m512i *X )
|
||||
{
|
||||
__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
|
||||
|
||||
Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
|
||||
Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
|
||||
|
||||
Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
|
||||
Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
|
||||
|
||||
Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
|
||||
Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
|
||||
|
||||
Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
|
||||
Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
|
||||
|
||||
X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
|
||||
X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
|
||||
X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
|
||||
X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
|
||||
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
|
||||
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
|
||||
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
|
||||
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
|
||||
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
|
||||
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
|
||||
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
|
||||
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
|
||||
}
|
||||
|
||||
static void salsa_unshuffle_4way_simd128( __m512i *X )
|
||||
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
|
||||
{
|
||||
__m512i Y0, Y1, Y2, Y3;
|
||||
|
||||
Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
|
||||
Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
|
||||
Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
|
||||
Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
|
||||
|
||||
Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
|
||||
Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
|
||||
Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
|
||||
Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
|
||||
|
||||
X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
|
||||
X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
|
||||
X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
|
||||
X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
|
||||
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
|
||||
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
|
||||
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
|
||||
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
|
||||
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
|
||||
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
|
||||
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
|
||||
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
|
||||
}
|
||||
|
||||
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
|
||||
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
|
||||
// { l1xb, l1xa, l1x9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
|
||||
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
|
||||
|
||||
static void salsa_shuffle_2way_simd128( __m256i *X )
|
||||
static inline void salsa_shuffle_2way_simd128( __m256i *X )
|
||||
{
|
||||
__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
|
||||
|
||||
Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
|
||||
Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
|
||||
|
||||
Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
|
||||
Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
|
||||
|
||||
Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
|
||||
Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
|
||||
|
||||
Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
|
||||
Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
|
||||
|
||||
X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
|
||||
X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
|
||||
X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
|
||||
X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
|
||||
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
|
||||
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
|
||||
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
|
||||
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
|
||||
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
|
||||
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
|
||||
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
|
||||
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
|
||||
}
|
||||
|
||||
static void salsa_unshuffle_2way_simd128( __m256i *X )
|
||||
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
|
||||
{
|
||||
__m256i Y0, Y1, Y2, Y3;
|
||||
|
||||
Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
|
||||
Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
|
||||
Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
|
||||
Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
|
||||
|
||||
Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
|
||||
Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
|
||||
Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
|
||||
Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
|
||||
|
||||
X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
|
||||
X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
|
||||
X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
|
||||
X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
|
||||
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
|
||||
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
|
||||
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
|
||||
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
|
||||
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
|
||||
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
|
||||
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
|
||||
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
|
||||
}
|
||||
|
||||
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
|
||||
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
||||
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
|
||||
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
|
||||
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
|
||||
X0 = _mm_blend_epi32( X0, Y0, 0x3);
|
||||
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
|
||||
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
|
||||
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
|
||||
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
|
||||
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
|
||||
// Double buffered, 2x memory usage
|
||||
// No interleaving
|
||||
|
||||
static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
||||
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
||||
{
|
||||
__m128i *XA = (__m128i*)xa;
|
||||
__m128i *XB = (__m128i*)xb;
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
||||
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
|
||||
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
|
||||
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
|
||||
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
|
||||
|
||||
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
|
||||
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
|
||||
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
|
||||
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
|
||||
|
||||
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
|
||||
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
|
||||
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
|
||||
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
|
||||
|
||||
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
|
||||
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
|
||||
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
|
||||
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
|
||||
|
||||
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
|
||||
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
|
||||
|
||||
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
|
||||
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
|
||||
|
||||
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
|
||||
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
|
||||
|
||||
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
|
||||
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
|
||||
|
||||
#else
|
||||
|
||||
// SSE4.1
|
||||
|
||||
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
|
||||
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
|
||||
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
|
||||
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
|
||||
|
||||
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
|
||||
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
|
||||
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
|
||||
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
|
||||
|
||||
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
|
||||
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
|
||||
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
|
||||
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
|
||||
|
||||
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
|
||||
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
|
||||
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
|
||||
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
|
||||
|
||||
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
|
||||
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
|
||||
|
||||
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
|
||||
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
|
||||
|
||||
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
|
||||
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
|
||||
|
||||
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
|
||||
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
|
||||
|
||||
#endif // AVX2 else SSE4_1
|
||||
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
||||
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
||||
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
||||
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
||||
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
||||
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
||||
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
||||
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
||||
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||
|
||||
#else // SSE2
|
||||
|
||||
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
||||
|
||||
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
||||
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
||||
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
|
||||
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
||||
#endif
|
||||
}
|
||||
|
||||
static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||
{
|
||||
|
||||
__m128i *XA = (__m128i*)xa;
|
||||
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
|
||||
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
|
||||
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
|
||||
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
|
||||
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
|
||||
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
|
||||
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
|
||||
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
|
||||
|
||||
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
|
||||
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
|
||||
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
|
||||
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
|
||||
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
|
||||
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
|
||||
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
|
||||
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
|
||||
|
||||
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
|
||||
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
|
||||
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
|
||||
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
|
||||
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
|
||||
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
|
||||
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
|
||||
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
|
||||
|
||||
#else // SSE4_1
|
||||
|
||||
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
|
||||
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
|
||||
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
|
||||
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
|
||||
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
|
||||
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
|
||||
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
|
||||
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
|
||||
|
||||
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
|
||||
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
|
||||
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
|
||||
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
|
||||
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
|
||||
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
|
||||
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
|
||||
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
|
||||
|
||||
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
|
||||
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
|
||||
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
|
||||
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
|
||||
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
|
||||
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
|
||||
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
|
||||
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
|
||||
|
||||
#endif // AVX2 else SSE4_1
|
||||
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
||||
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
||||
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
||||
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
||||
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
||||
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
||||
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
||||
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
||||
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||
|
||||
#else // SSE2
|
||||
|
||||
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
|
||||
}
|
||||
|
||||
|
||||
static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
||||
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
||||
uint32_t *xc )
|
||||
{
|
||||
__m128i *XA = (__m128i*)xa;
|
||||
__m128i *XB = (__m128i*)xb;
|
||||
__m128i *XC = (__m128i*)xc;
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
|
||||
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
|
||||
YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
|
||||
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
|
||||
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
|
||||
ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
|
||||
|
||||
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
|
||||
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
|
||||
YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
|
||||
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
|
||||
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
|
||||
ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
|
||||
|
||||
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
|
||||
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
|
||||
YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
|
||||
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
|
||||
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
|
||||
ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
|
||||
|
||||
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
|
||||
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
|
||||
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
|
||||
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
|
||||
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
|
||||
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
|
||||
|
||||
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
|
||||
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
|
||||
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
|
||||
|
||||
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
|
||||
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
|
||||
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
|
||||
|
||||
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
|
||||
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
|
||||
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
|
||||
|
||||
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
|
||||
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
|
||||
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
|
||||
|
||||
#else
|
||||
|
||||
// SSE4.1
|
||||
|
||||
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
|
||||
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
|
||||
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
|
||||
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
|
||||
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
|
||||
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
|
||||
|
||||
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
|
||||
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
|
||||
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
|
||||
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
|
||||
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
|
||||
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
|
||||
|
||||
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
|
||||
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
|
||||
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
|
||||
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
|
||||
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
|
||||
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
|
||||
|
||||
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
|
||||
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
|
||||
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
|
||||
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
|
||||
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
|
||||
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
|
||||
|
||||
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
|
||||
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
|
||||
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
|
||||
|
||||
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
|
||||
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
|
||||
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
|
||||
|
||||
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
|
||||
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
|
||||
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
|
||||
|
||||
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
|
||||
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
|
||||
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
|
||||
|
||||
#endif // AVX2 else SSE4_1
|
||||
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
||||
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
||||
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
||||
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
||||
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
||||
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
||||
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
||||
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
||||
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
|
||||
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
|
||||
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
|
||||
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
|
||||
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||
|
||||
#else // SSE2
|
||||
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
||||
|
||||
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
||||
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
||||
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
|
||||
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
||||
#endif
|
||||
}
|
||||
|
||||
static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||
uint32_t* xc )
|
||||
{
|
||||
__m128i *XA = (__m128i*)xa;
|
||||
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
|
||||
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
|
||||
YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
|
||||
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
|
||||
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
|
||||
YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
|
||||
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
|
||||
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
|
||||
YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
|
||||
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
|
||||
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
|
||||
YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
|
||||
|
||||
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
|
||||
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
|
||||
YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
|
||||
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
|
||||
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
|
||||
YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
|
||||
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
|
||||
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
|
||||
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
|
||||
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
|
||||
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
|
||||
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
|
||||
|
||||
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
|
||||
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
|
||||
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
|
||||
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
|
||||
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
|
||||
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
|
||||
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
|
||||
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
|
||||
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
|
||||
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
|
||||
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
|
||||
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
|
||||
|
||||
#else // SSE4_1
|
||||
|
||||
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
|
||||
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
|
||||
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
|
||||
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
|
||||
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
|
||||
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
|
||||
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
|
||||
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
|
||||
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
|
||||
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
|
||||
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
|
||||
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
|
||||
|
||||
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
|
||||
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
|
||||
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
|
||||
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
|
||||
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
|
||||
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
|
||||
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
|
||||
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
|
||||
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
|
||||
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
|
||||
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
|
||||
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
|
||||
|
||||
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
|
||||
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
|
||||
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
|
||||
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
|
||||
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
|
||||
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
|
||||
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
|
||||
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
|
||||
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
|
||||
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
|
||||
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
|
||||
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
|
||||
|
||||
#endif // AVX2 else SSE4_1
|
||||
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
||||
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
||||
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
||||
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
||||
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
||||
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
||||
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
||||
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
||||
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
|
||||
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
|
||||
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
|
||||
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
|
||||
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||
|
||||
#else // SSE2
|
||||
|
||||
|
@@ -1,270 +0,0 @@
|
||||
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* This file contains some functions which implement the external data
|
||||
* handling and padding for Merkle-Damgard hash functions which follow
|
||||
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
|
||||
*
|
||||
* API: this file is meant to be included, not compiled as a stand-alone
|
||||
* file. Some macros must be defined:
|
||||
* RFUN name for the round function
|
||||
* HASH "short name" for the hash function
|
||||
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
|
||||
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
|
||||
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
|
||||
* LE64 defined for little-endian, 64-bit based (no example yet)
|
||||
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
|
||||
* BLEN if defined, length of a message block (in bytes)
|
||||
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
|
||||
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
|
||||
* SVAL if defined, reference to the context state information
|
||||
*
|
||||
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
|
||||
* this is used for instance for Tiger, which works on 64-bit words but
|
||||
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
|
||||
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
|
||||
* set, then only one word (64 bits) will be used to encode the input
|
||||
* message length (in bits), otherwise two words will be used (as in
|
||||
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
|
||||
* not PLW1), four 64-bit words will be used to encode the message length
|
||||
* (in bits). Note that regardless of those settings, only 64-bit message
|
||||
* lengths are supported (in bits): messages longer than 2 Exabytes will be
|
||||
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
|
||||
* 2 million Terabytes, which is huge).
|
||||
*
|
||||
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
|
||||
* function. This is used for Tiger2, which is identical to Tiger except
|
||||
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
|
||||
* of the 0x01 from original Tiger).
|
||||
*
|
||||
* The RFUN function is invoked with two arguments, the first pointing to
|
||||
* aligned data (as a "const void *"), the second being state information
|
||||
* from the context structure. By default, this state information is the
|
||||
* "val" field from the context, and this field is assumed to be an array
|
||||
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
|
||||
* When SVAL is defined it is used instead. The "val" field can have any type, except
|
||||
* for the output encoding which assumes that it is an array of "sph_u32"
|
||||
* values. By defining NO_OUTPUT, this last step is deactivated; the
|
||||
* includer code is then responsible for writing out the hash result. When
|
||||
* NO_OUTPUT is defined, the third parameter to the "close()" function is
|
||||
* ignored.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
/* MSVC: silence C4146 (unary minus applied to an unsigned type). */
#if defined(_MSC_VER)
#pragma warning (disable: 4146)
#endif

/* Two-level token pasting so that macro arguments are expanded before ##. */
#undef SPH_XCAT
#undef SPH_XCAT_
#define SPH_XCAT_(a, b)   a ## b
#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)

/*
 * Word length and default block length, in bytes: 8 / 128 for the 64-bit
 * variants (BE64, LE64), 4 / 64 otherwise.
 */
#undef SPH_WLEN
#undef SPH_BLEN
#if defined(BE64) || defined(LE64)
#define SPH_WLEN 8U
#define SPH_BLEN 128U
#else
#define SPH_WLEN 4U
#define SPH_BLEN 64U
#endif

/* An includer-supplied BLEN overrides the default block length. */
#if defined(BLEN)
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif

/*
 * Offset within a block where the encoded message length starts:
 * one word is reserved with PLW1, four with PLW4, two otherwise.
 */
#undef SPH_MAXPAD
#if defined(PLW1)
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined(PLW4)
#define SPH_MAXPAD (SPH_BLEN - (4 * SPH_WLEN))
#else
#define SPH_MAXPAD (SPH_BLEN - (2 * SPH_WLEN))
#endif

/*
 * State reference handed to RFUN. Defaults to the "val" field of the local
 * context pointer "sc"; an includer-supplied SVAL replaces it and also
 * disables the built-in output step.
 */
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifndef SVAL
#define SPH_VAL sc->val
#else
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#endif
|
||||
|
||||
#ifndef CLOSE_ONLY
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
static void
|
||||
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
|
||||
#else
|
||||
void
|
||||
HASH ( void *cc, const void *data, size_t len )
|
||||
#endif
|
||||
{
|
||||
SPH_XCAT( HASH, _context ) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = SPH_BLEN - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == SPH_BLEN )
|
||||
{
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
ptr = 0;
|
||||
}
|
||||
sc->count += clen;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
/*
 * Public update entry point used when SPH_UPTR is defined.
 *
 * Short inputs (less than two blocks) go straight to the buffered helper.
 * For longer inputs, any partially filled block in the context is completed
 * first, then the remainder is passed to the helper.
 *
 * cc    hash context
 * data  interleaved input, indexed as __m256i words (byte offset >> 3)
 * len   input length in bytes per lane
 */
void
HASH (void *cc, const void *data, size_t len)
{
   SPH_XCAT(HASH, _context) *sc;
   __m256i *vdata = (__m256i*)data;
   unsigned ptr;

   if ( len < (2 * SPH_BLEN) )
   {
      SPH_XCAT(HASH, _short)(cc, data, len);
      return;
   }
   sc = cc;
   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
   if ( ptr > 0 )
   {
      unsigned t;

      /* Top up the partially filled block so the rest starts aligned. */
      t = SPH_BLEN - ptr;
      SPH_XCAT( HASH, _short )( cc, data, t );
      vdata = vdata + (t>>3);
      len -= t;
   }
   /*
    * Continue from the advanced position. Passing "data" here would hash
    * the first t bytes a second time and never reach the input tail.
    */
   SPH_XCAT( HASH, _short )( cc, vdata, len );
}
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Perform padding and produce result. The context is NOT reinitialized
|
||||
* by this function.
|
||||
*/
|
||||
static void
|
||||
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
|
||||
void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
unsigned ptr, u;
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
|
||||
#ifdef PW01
|
||||
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
|
||||
#else
|
||||
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
|
||||
#endif
|
||||
ptr += 8;
|
||||
|
||||
if ( ptr > SPH_MAXPAD )
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
|
||||
}
|
||||
#if defined BE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD>>3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#elif defined PLW4
|
||||
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
|
||||
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#else
|
||||
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#endif // PLW
|
||||
#else // LE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
#elif defined PLW4
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( c->count >> 61 );
|
||||
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
|
||||
2 * SPH_WLEN );
|
||||
#else
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( sc->count >> 61 );
|
||||
#endif // PLW
|
||||
|
||||
#endif // LE64
|
||||
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
|
||||
#ifdef SPH_NO_OUTPUT
|
||||
(void)dst;
|
||||
(void)rnum;
|
||||
(void)u;
|
||||
#else
|
||||
for ( u = 0; u < rnum; u ++ )
|
||||
{
|
||||
#if defined BE64
|
||||
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
|
||||
#else // LE64
|
||||
((__m256i*)dst)[u] = sc->val[u];
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
|
||||
}
|
@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H;
|
||||
|
||||
X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
|
||||
X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
|
||||
// Except X[ 9] which is part of W[ 0] from the third group.
|
||||
X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
|
||||
X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
|
||||
SSG2_0x( W[ 2] ) ), W[ 1] );
|
||||
X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
|
||||
W[ 2] );
|
||||
X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
|
||||
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
W[ 6] );
|
||||
X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
|
||||
W[ 7] );
|
||||
X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
|
||||
W[ 8] );
|
||||
X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
|
||||
X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
|
||||
X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
|
||||
X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
|
||||
X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
|
||||
X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
|
||||
X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
|
||||
X[14] = SSG2_0x( W[15] );
|
||||
X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
|
||||
|
||||
X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
|
||||
|
||||
A = _mm256_load_si256( state_in );
|
||||
B = _mm256_load_si256( state_in + 1 );
|
||||
C = _mm256_load_si256( state_in + 2 );
|
||||
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
G = _mm256_load_si256( state_mid + 6 );
|
||||
H = _mm256_load_si256( state_mid + 7 );
|
||||
|
||||
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
|
||||
#endif
|
||||
@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
|
||||
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
|
||||
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
|
||||
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
|
||||
W[ 2] ) );
|
||||
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
|
||||
W[ 3] ) );
|
||||
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
|
||||
W[ 4] ) );
|
||||
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
|
||||
W[ 5] ) );
|
||||
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
|
||||
W[ 6] ) );
|
||||
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
|
||||
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
|
||||
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
|
||||
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
|
||||
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
|
||||
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
|
||||
W[ 7] ) );
|
||||
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
|
||||
W[ 8] ) );
|
||||
|
||||
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
SHA256x8_MSG_EXPANSION( W );
|
||||
|
||||
W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
|
||||
W[ 9] ) );
|
||||
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
|
||||
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
|
||||
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
|
||||
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
|
||||
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
|
||||
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
|
||||
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
|
||||
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
|
||||
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
|
||||
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
||||
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
|
||||
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||
|
||||
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
SHA256x8_MSG_EXPANSION( W );
|
||||
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
|
||||
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
|
||||
// precalculate constant part msg expansion for second iteration.
|
||||
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
|
||||
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
// X is pre-expanded constant part of msg for second group, rounds 16 to 31.
|
||||
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
|
||||
// Except X[ 9] which is used to pre-expand part of W[ 0] from the third
|
||||
// group, rounds 32 to 48.
|
||||
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
|
||||
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
|
||||
SSG2_0x16( W[ 2] ) ), W[ 1] );
|
||||
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
|
||||
W[ 2] );
|
||||
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
|
||||
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
W[ 6] );
|
||||
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
|
||||
W[ 7] );
|
||||
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
|
||||
W[ 8] );
|
||||
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
|
||||
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
|
||||
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
|
||||
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
|
||||
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
|
||||
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
|
||||
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
|
||||
X[14] = SSG2_0x16( W[15] );
|
||||
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
|
||||
|
||||
X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
|
||||
|
||||
A = _mm512_load_si512( state_in );
|
||||
B = _mm512_load_si512( state_in + 1 );
|
||||
C = _mm512_load_si512( state_in + 2 );
|
||||
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// update precalculated msg expansion with new nonce: W[3].
|
||||
// inject nonce, W[3], to complete msg expansion.
|
||||
W[ 0] = X[ 0];
|
||||
W[ 1] = X[ 1];
|
||||
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
|
||||
@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
|
||||
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
|
||||
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
|
||||
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
|
||||
W[ 2] ) );
|
||||
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
|
||||
W[ 3] ) );
|
||||
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
|
||||
W[ 4] ) );
|
||||
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
|
||||
W[ 5] ) );
|
||||
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
|
||||
W[ 6] ) );
|
||||
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
|
||||
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
|
||||
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
|
||||
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
|
||||
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
|
||||
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
|
||||
W[ 7] ) );
|
||||
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
|
||||
W[ 8] ) );
|
||||
|
||||
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
SHA256x16_MSG_EXPANSION( W );
|
||||
|
||||
W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
|
||||
W[ 9] ) );
|
||||
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
|
||||
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
|
||||
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
|
||||
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
|
||||
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
|
||||
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
|
||||
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
|
||||
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
|
||||
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
|
||||
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
||||
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
|
||||
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||
|
||||
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
SHA256x16_MSG_EXPANSION( W );
|
||||
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
|
||||
@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16]; memcpy_512( W, data, 16 );
|
||||
// Value for H at round 60, before adding K, to produce valid final hash
|
||||
//where H == 0.
|
||||
// Value for H at round 60, before adding K, needed to produce valid final
|
||||
// hash where H == 0.
|
||||
// H_ = -( H256[7] + K256[60] );
|
||||
const __m512i H_ = m512_const1_32( 0x136032ED );
|
||||
|
||||
|
@@ -33,6 +33,7 @@
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
|
||||
#ifdef __SSE4_1__
|
||||
|
||||
#include "shabal-hash-4way.h"
|
||||
@@ -44,21 +45,6 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Part of this code was automatically generated (the part between
|
||||
* the "BEGIN" and "END" markers).
|
||||
*/
|
||||
|
||||
#define sM 16
|
||||
|
||||
#define C32 SPH_C32
|
||||
#define T32 SPH_T32
|
||||
|
||||
#define O1 13
|
||||
#define O2 9
|
||||
#define O3 6
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define DECL_STATE8 \
|
||||
@@ -310,72 +296,71 @@ do { \
|
||||
mm256_swap512_256( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
|
||||
do { \
|
||||
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
|
||||
_mm256_andnot_si256( xb3, xb2 ), \
|
||||
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
|
||||
FIVE ) ), THREE ) ) ); \
|
||||
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
|
||||
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
|
||||
xb3, xb2 ) ); \
|
||||
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0_8 do { \
|
||||
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
|
||||
PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
|
||||
PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
|
||||
PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
|
||||
PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
|
||||
PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
|
||||
PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
|
||||
PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
|
||||
PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
|
||||
PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
|
||||
PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
|
||||
PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
|
||||
PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
|
||||
PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
|
||||
PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
|
||||
PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1_8 do { \
|
||||
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
|
||||
PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
|
||||
PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
|
||||
PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
|
||||
PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
|
||||
PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
|
||||
PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
|
||||
PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
|
||||
PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
|
||||
PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
|
||||
PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
|
||||
PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
|
||||
PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
|
||||
PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
|
||||
PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
|
||||
PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2_8 do { \
|
||||
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
|
||||
PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
|
||||
PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
|
||||
PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
|
||||
PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
|
||||
PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
|
||||
PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
|
||||
PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
|
||||
PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
|
||||
PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
|
||||
PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
|
||||
PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
|
||||
PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
|
||||
PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
|
||||
PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
|
||||
PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P8 \
|
||||
do { \
|
||||
@@ -437,8 +422,8 @@ do { \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W8 do { \
|
||||
if ((Wlow = T32(Wlow + 1)) == 0) \
|
||||
Whigh = T32(Whigh + 1); \
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 ) \
|
||||
Whigh = Whigh + 1; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
@@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
shabal_8way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
/*
|
||||
* We copy the state into local variables, so that the compiler knows
|
||||
* that it can optimize them at will.
|
||||
*/
|
||||
|
||||
|
||||
#define DECL_STATE \
|
||||
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
|
||||
A8, A9, AA, AB; \
|
||||
@@ -888,15 +866,6 @@ do { \
|
||||
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
|
||||
#define SWAP(v1, v2) do { \
|
||||
sph_u32 tmp = (v1); \
|
||||
(v1) = (v2); \
|
||||
(v2) = tmp; \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
mm128_swap256_128( B0, C0 ); \
|
||||
@@ -917,18 +886,6 @@ do { \
|
||||
mm128_swap256_128( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
|
||||
_mm_set1_epi32(5UL) ) \
|
||||
__m128i t2 = _mm_xor_si128( xa0, xc ); \
|
||||
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
|
||||
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_xor_si128( t2, \
|
||||
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
|
||||
*/
|
||||
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
@@ -1056,8 +1013,8 @@ do { \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W do { \
|
||||
if ((Wlow = T32(Wlow + 1)) == 0) \
|
||||
Whigh = T32(Whigh + 1); \
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 ) \
|
||||
Whigh = Whigh + 1; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
|
@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
|
||||
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
|
||||
void shabal512_4way_init( void *cc );
|
||||
void shabal512_4way_update( void *cc, const void *data, size_t len );
|
||||
//#define shabal512_4way shabal512_4way_update
|
||||
void shabal512_4way_close( void *cc, void *dst );
|
||||
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
|
||||
|
||||
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
|
||||
|
||||
#define REDUCE4w(x) \
|
||||
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
|
||||
_mm512_srai_epi16( x, 8 ) )
|
||||
|
||||
/*
|
||||
#define REDUCE4w(x) \
|
||||
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
|
||||
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
|
||||
*/
|
||||
|
||||
#define EXTRA_REDUCE_S4w(x)\
|
||||
#define EXTRA_REDUCE_S4w(x) \
|
||||
_mm512_sub_epi16( x, _mm512_and_si512( \
|
||||
m512_const1_64( 0x0101010101010101 ), \
|
||||
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
|
||||
@@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
|
||||
|
||||
#define DO_REDUCE_FULL_S4w(i) \
|
||||
do { \
|
||||
X(i) = REDUCE4w( X(i) ); \
|
||||
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
|
||||
X(i) = REDUCE4w( X(i) ); \
|
||||
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
|
||||
// Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||
// Output data is in revbin_permuted order.
|
||||
|
||||
static const int w[] = {0, 2, 4, 6};
|
||||
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
|
||||
|
||||
|
||||
// targeted
|
||||
#define BUTTERFLY_0( i,j ) \
|
||||
do { \
|
||||
@@ -443,25 +445,25 @@ do { \
|
||||
X(i) = _mm512_sub_epi16( X(i), v ); \
|
||||
} while(0)
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w ) \
|
||||
do { \
|
||||
__m512i v = X(j); \
|
||||
X(j) = _mm512_add_epi16( X(i), X(j) ); \
|
||||
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
|
||||
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
|
||||
} while(0)
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE( 2 );
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 1 );
|
||||
|
||||
@@ -501,12 +503,11 @@ do { \
|
||||
// Transpose the FFT state with a revbin order permutation
|
||||
// on the rows and the column.
|
||||
// This will make the full FFT_64 in order.
|
||||
#define INTERLEAVE(i,j) \
|
||||
#define INTERLEAVE( i, j ) \
|
||||
do { \
|
||||
__m512i t1= X(i); \
|
||||
__m512i t2= X(j); \
|
||||
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
|
||||
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
|
||||
__m512i u = X(j); \
|
||||
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
|
||||
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
|
||||
} while(0)
|
||||
|
||||
INTERLEAVE( 1, 0 );
|
||||
@@ -534,10 +535,10 @@ do { \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w ) \
|
||||
do { \
|
||||
__m512i u = X(j); \
|
||||
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
|
||||
X(i) = _mm512_slli_epi16( X(i), w ); \
|
||||
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
|
||||
X(i) = _mm512_add_epi16( u, X(i) ); \
|
||||
} while(0)
|
||||
@@ -558,15 +559,15 @@ do { \
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE_FULL_S4w( 0 );
|
||||
DO_REDUCE_FULL_S4w( 1 );
|
||||
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
|
||||
// Temp space to help for interleaving in the end
|
||||
__m512i B[8];
|
||||
__m512i *A = (__m512i*) a;
|
||||
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
|
||||
|
||||
/* Size-2 butterflies */
|
||||
for ( i = 0; i<8; i++ )
|
||||
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
|
||||
|
||||
__m512i *X = (__m512i*)x;
|
||||
__m512i *A = (__m512i*)a;
|
||||
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
|
||||
|
||||
#define UNPACK( i ) \
|
||||
do { \
|
||||
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
|
||||
|
||||
__m512i *X = (__m512i*)x;
|
||||
__m512i *A = (__m512i*)a;
|
||||
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
|
||||
|
||||
#define UNPACK( i ) \
|
||||
do { \
|
||||
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
// We split the round function in two halves
|
||||
// so as to insert some independent computations in between
|
||||
|
||||
// generic
|
||||
#if 0
|
||||
#define SUM7_00 0
|
||||
#define SUM7_01 1
|
||||
#define SUM7_02 2
|
||||
#define SUM7_03 3
|
||||
#define SUM7_04 4
|
||||
#define SUM7_05 5
|
||||
#define SUM7_06 6
|
||||
|
||||
#define SUM7_10 1
|
||||
#define SUM7_11 2
|
||||
#define SUM7_12 3
|
||||
#define SUM7_13 4
|
||||
#define SUM7_14 5
|
||||
#define SUM7_15 6
|
||||
#define SUM7_16 0
|
||||
|
||||
#define SUM7_20 2
|
||||
#define SUM7_21 3
|
||||
#define SUM7_22 4
|
||||
#define SUM7_23 5
|
||||
#define SUM7_24 6
|
||||
#define SUM7_25 0
|
||||
#define SUM7_26 1
|
||||
|
||||
#define SUM7_30 3
|
||||
#define SUM7_31 4
|
||||
#define SUM7_32 5
|
||||
#define SUM7_33 6
|
||||
#define SUM7_34 0
|
||||
#define SUM7_35 1
|
||||
#define SUM7_36 2
|
||||
|
||||
#define SUM7_40 4
|
||||
#define SUM7_41 5
|
||||
#define SUM7_42 6
|
||||
#define SUM7_43 0
|
||||
#define SUM7_44 1
|
||||
#define SUM7_45 2
|
||||
#define SUM7_46 3
|
||||
|
||||
#define SUM7_50 5
|
||||
#define SUM7_51 6
|
||||
#define SUM7_52 0
|
||||
#define SUM7_53 1
|
||||
#define SUM7_54 2
|
||||
#define SUM7_55 3
|
||||
#define SUM7_56 4
|
||||
|
||||
#define SUM7_60 6
|
||||
#define SUM7_61 0
|
||||
#define SUM7_62 1
|
||||
#define SUM7_63 2
|
||||
#define SUM7_64 3
|
||||
#define SUM7_65 4
|
||||
#define SUM7_66 5
|
||||
|
||||
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
|
||||
|
||||
#define PERM_0(d,a) /* XOR 1 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##l, 1 ); \
|
||||
d##h = shufxor( a##h, 1 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_1(d,a) /* XOR 6 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##h, 2 ); \
|
||||
d##h = shufxor( a##l, 2 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_2(d,a) /* XOR 2 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##l, 2 ); \
|
||||
d##h = shufxor( a##h, 2 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_3(d,a) /* XOR 3 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##l, 3 ); \
|
||||
d##h = shufxor( a##h, 3 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_4(d,a) /* XOR 5 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##h, 1 ); \
|
||||
d##h = shufxor( a##l, 1 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_5(d,a) /* XOR 7 */ \
|
||||
do { \
|
||||
d##l = shufxor( a##h, 3 ); \
|
||||
d##h = shufxor( a##l, 3 ); \
|
||||
} while(0)
|
||||
|
||||
#define PERM_6(d,a) /* XOR 4 */ \
|
||||
do { \
|
||||
d##l = a##h; \
|
||||
d##h = a##l; \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
// targeted
|
||||
|
||||
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
|
||||
|
@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Do not use with 128 bit data
|
||||
// Broken for 80 & 128 bytes, use prehash or full
|
||||
void
|
||||
skein512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
|
@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
skeinhash(hash64, endiandata);
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
|
||||
sph_skein512_close(&ctx_skein, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
|
||||
}
|
||||
|
||||
int scanhash_skein2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t hash64[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
skein2hash(hash64, endiandata);
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
@@ -1,291 +0,0 @@
|
||||
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* This file contains some functions which implement the external data
|
||||
* handling and padding for Merkle-Damgard hash functions which follow
|
||||
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
|
||||
*
|
||||
* API: this file is meant to be included, not compiled as a stand-alone
|
||||
* file. Some macros must be defined:
|
||||
* RFUN name for the round function
|
||||
* HASH "short name" for the hash function
|
||||
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
|
||||
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
|
||||
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
|
||||
* LE64 defined for little-endian, 64-bit based (no example yet)
|
||||
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
|
||||
* BLEN if defined, length of a message block (in bytes)
|
||||
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
|
||||
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
|
||||
* SVAL if defined, reference to the context state information
|
||||
*
|
||||
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
|
||||
* this is used for instance for Tiger, which works on 64-bit words but
|
||||
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
|
||||
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
|
||||
* set, then only one word (64 bits) will be used to encode the input
|
||||
* message length (in bits), otherwise two words will be used (as in
|
||||
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
|
||||
* not PLW1), four 64-bit words will be used to encode the message length
|
||||
* (in bits). Note that regardless of those settings, only 64-bit message
|
||||
* lengths are supported (in bits): messages longer than 2 Exabytes will be
|
||||
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
|
||||
* 2 million Terabytes, which is huge).
|
||||
*
|
||||
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
|
||||
* function. This is used for Tiger2, which is identical to Tiger except
|
||||
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
|
||||
* of the 0x01 from original Tiger).
|
||||
*
|
||||
* The RFUN function is invoked with two arguments, the first pointing to
|
||||
* aligned data (as a "const void *"), the second being state information
|
||||
* from the context structure. By default, this state information is the
|
||||
* "val" field from the context, and this field is assumed to be an array
|
||||
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
|
||||
* The "val" field can have any type, except
|
||||
* for the output encoding which assumes that it is an array of "sph_u32"
|
||||
* values. By defining NO_OUTPUT, this last step is deactivated; the
|
||||
* includer code is then responsible for writing out the hash result. When
|
||||
* NO_OUTPUT is defined, the third parameter to the "close()" function is
|
||||
* ignored.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
#undef SPH_BLEN
|
||||
#undef SPH_WLEN
|
||||
#if defined BE64 || defined LE64
|
||||
#define SPH_BLEN 128U
|
||||
#define SPH_WLEN 8U
|
||||
#else
|
||||
#define SPH_BLEN 64U
|
||||
#define SPH_WLEN 4U
|
||||
#endif
|
||||
|
||||
#ifdef BLEN
|
||||
#undef SPH_BLEN
|
||||
#define SPH_BLEN BLEN
|
||||
#endif
|
||||
|
||||
#undef SPH_MAXPAD
|
||||
#if defined PLW1
|
||||
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
|
||||
#elif defined PLW4
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
|
||||
#else
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
|
||||
#endif
|
||||
|
||||
#undef SPH_VAL
|
||||
#undef SPH_NO_OUTPUT
|
||||
#ifdef SVAL
|
||||
#define SPH_VAL SVAL
|
||||
#define SPH_NO_OUTPUT 1
|
||||
#else
|
||||
#define SPH_VAL sc->val
|
||||
#endif
|
||||
|
||||
#ifndef CLOSE_ONLY
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
static void
|
||||
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
|
||||
#else
|
||||
void
|
||||
HASH ( void *cc, const void *data, size_t len )
|
||||
#endif
|
||||
{
|
||||
SPH_XCAT( HASH, _context ) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = SPH_BLEN - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == SPH_BLEN )
|
||||
{
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
ptr = 0;
|
||||
}
|
||||
sc->count += clen;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
void
|
||||
HASH (void *cc, const void *data, size_t len)
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
unsigned ptr;
|
||||
|
||||
if ( len < (2 * SPH_BLEN) )
|
||||
{
|
||||
SPH_XCAT(HASH, _short)(cc, data, len);
|
||||
return;
|
||||
}
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
if ( ptr > 0 )
|
||||
{
|
||||
unsigned t;
|
||||
t = SPH_BLEN - ptr;
|
||||
SPH_XCAT( HASH, _short )( cc, data, t );
|
||||
vdata = vdata + (t>>3);
|
||||
len -= t;
|
||||
}
|
||||
SPH_XCAT( HASH, _short )( cc, data, len );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Perform padding and produce result. The context is NOT reinitialized
|
||||
* by this function.
|
||||
*/
|
||||
static void
|
||||
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
|
||||
void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
unsigned ptr, u;
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
|
||||
//uint64_t *b= (uint64_t*)sc->buf;
|
||||
//uint64_t *s= (uint64_t*)sc->state;
|
||||
//printf("Vptr 1= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
|
||||
#ifdef PW01
|
||||
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
|
||||
// sc->buf[ptr++] = 0x100 >> 8;
|
||||
#else
|
||||
// need to overwrite exactly one byte
|
||||
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
|
||||
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
|
||||
// ptr++;
|
||||
#endif
|
||||
ptr += 8;
|
||||
|
||||
//printf("Vptr 2= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
|
||||
if ( ptr > SPH_MAXPAD )
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
|
||||
}
|
||||
#if defined BE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD>>3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#elif defined PLW4
|
||||
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
|
||||
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#else
|
||||
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#endif // PLW
|
||||
#else // LE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
#elif defined PLW4
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( c->count >> 61 );
|
||||
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
|
||||
2 * SPH_WLEN );
|
||||
#else
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( sc->count >> 61 );
|
||||
#endif // PLW
|
||||
|
||||
#endif // LE64
|
||||
|
||||
//printf("Vptr 3= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
|
||||
//printf("Vptr after= %u\n", ptr);
|
||||
//printf("VState %016llx %016llx %016llx %016llx\n", s[0], s[4], s[8], s[12] );
|
||||
//printf("VState %016llx %016llx %016llx %016llx\n", s[16], s[20], s[24], s[28] );
|
||||
|
||||
#ifdef SPH_NO_OUTPUT
|
||||
(void)dst;
|
||||
(void)rnum;
|
||||
(void)u;
|
||||
#else
|
||||
for ( u = 0; u < rnum; u ++ )
|
||||
{
|
||||
#if defined BE64
|
||||
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
|
||||
#else // LE64
|
||||
((__m256i*)dst)[u] = sc->val[u];
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@@ -1,108 +0,0 @@
|
||||
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* WHIRLPOOL interface.
|
||||
*
|
||||
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
|
||||
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
|
||||
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
|
||||
* version, 2003, with a new diffusion matrix, also described as "plain
|
||||
* WHIRLPOOL"). All three variants are implemented here.
|
||||
*
|
||||
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
|
||||
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
|
||||
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
|
||||
*
|
||||
* The current WHIRLPOOL specification and a reference implementation
|
||||
* can be found on the WHIRLPOOL web page:
|
||||
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_whirlpool.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef WHIRLPOOL_HASH_4WAY_H__
|
||||
#define WHIRLPOOL_HASH_4WAY_H__
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool 512
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL-0.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool0 512
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL-1.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool1 512
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
__m256i state[8];
|
||||
sph_u64 count;
|
||||
} whirlpool_4way_context;
|
||||
|
||||
void whirlpool_4way_init( void *cc );
|
||||
|
||||
void whirlpool_4way( void *cc, const void *data, size_t len );
|
||||
|
||||
void whirlpool_4way_close( void *cc, void *dst );
|
||||
|
||||
/**
|
||||
* WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
|
||||
*/
|
||||
typedef whirlpool_4way_context whirlpool0_4way_context;
|
||||
|
||||
#define whirlpool0_4way_init whirlpool_4way_init
|
||||
|
||||
void whirlpool0_4way( void *cc, const void *data, size_t len );
|
||||
|
||||
void whirlpool0_4way_close( void *cc, void *dst );
|
||||
|
||||
/**
|
||||
* WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
|
||||
*/
|
||||
typedef whirlpool_4way_context whirlpool1_4way_context;
|
||||
|
||||
#define whirlpool1_4way_init whirlpool_4way_init
|
||||
|
||||
void whirlpool1_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void whirlpool1_4way_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -12,6 +12,7 @@
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#if defined(__VAES__)
|
||||
@@ -22,15 +23,15 @@
|
||||
|
||||
#if defined (C11_8WAY)
|
||||
|
||||
typedef struct {
|
||||
union _c11_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
simd_4way_context simd;
|
||||
cube_4way_2buf_context cube;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
@@ -40,32 +41,14 @@ typedef struct {
|
||||
sph_shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
} c11_8way_ctx_holder;
|
||||
simd_4way_context simd;
|
||||
} __attribute__ ((aligned (64)));
|
||||
typedef union _c11_8way_context_overlay c11_8way_context_overlay;
|
||||
|
||||
c11_8way_ctx_holder c11_8way_ctx;
|
||||
static __thread __m512i c11_8way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
|
||||
|
||||
void init_c11_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &c11_8way_ctx.blake );
|
||||
bmw512_8way_init( &c11_8way_ctx.bmw );
|
||||
skein512_8way_init( &c11_8way_ctx.skein );
|
||||
jh512_8way_init( &c11_8way_ctx.jh );
|
||||
keccak512_8way_init( &c11_8way_ctx.keccak );
|
||||
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
|
||||
simd_4way_init( &c11_8way_ctx.simd, 512 );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
|
||||
shavite512_4way_init( &c11_8way_ctx.shavite );
|
||||
echo_4way_init( &c11_8way_ctx.echo, 512 );
|
||||
#else
|
||||
init_groestl( &c11_8way_ctx.groestl, 64 );
|
||||
sph_shavite512_init( &c11_8way_ctx.shavite );
|
||||
init_echo( &c11_8way_ctx.echo, 512 );
|
||||
#endif
|
||||
}
|
||||
|
||||
void c11_8way_hash( void *state, const void *input )
|
||||
int c11_8way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||
@@ -78,24 +61,19 @@ void c11_8way_hash( void *state, const void *input )
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
c11_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
|
||||
c11_8way_context_overlay ctx;
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
|
||||
c11_8way_midstate );
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
|
||||
groestl512_4way_init( &ctx.groestl, 64 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
@@ -104,21 +82,14 @@ void c11_8way_hash( void *state, const void *input )
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
@@ -126,83 +97,56 @@ void c11_8way_hash( void *state, const void *input )
|
||||
#endif
|
||||
|
||||
// 4 JH
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_4way_init( &ctx.shavite );
|
||||
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
|
||||
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
|
||||
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
|
||||
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
|
||||
shavite512_full( &ctx.shavite, hash4, hash4, 64 );
|
||||
shavite512_full( &ctx.shavite, hash5, hash5, 64 );
|
||||
shavite512_full( &ctx.shavite, hash6, hash6, 64 );
|
||||
shavite512_full( &ctx.shavite, hash7, hash7, 64 );
|
||||
|
||||
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
|
||||
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
|
||||
|
||||
#endif
|
||||
|
||||
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_4way_init( &ctx.simd, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
|
||||
echo_4way_init( &ctx.echo, 512 );
|
||||
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
|
||||
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
@@ -212,29 +156,22 @@ void c11_8way_hash( void *state, const void *input )
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)hash0, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
(const BitSequence *)hash1, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
||||
(const BitSequence *)hash2, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
|
||||
(const BitSequence *)hash4, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
|
||||
(const BitSequence *)hash5, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
|
||||
(const BitSequence *)hash6, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
|
||||
(const BitSequence *)hash7, 64 );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -246,225 +183,223 @@ void c11_8way_hash( void *state, const void *input )
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const __m512i eight = m512_const1_64( 8 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
max_nonce -= 8;
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, c11_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
c11_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( ( hash+(i<<3) )[7] <= Htarg )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
do
|
||||
{
|
||||
if ( likely( c11_8way_hash( hash, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
|
||||
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash + ( lane << 3 ), mythr );
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (C11_4WAY)
|
||||
|
||||
typedef struct {
|
||||
union _c11_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo512_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
} c11_4way_ctx_holder;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
};
|
||||
typedef union _c11_4way_context_overlay c11_4way_context_overlay;
|
||||
|
||||
c11_4way_ctx_holder c11_4way_ctx;
|
||||
static __thread __m256i c11_4way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
|
||||
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
bmw512_4way_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
keccak512_4way_init( &c11_4way_ctx.keccak );
|
||||
luffa_2way_init( &c11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_4way_ctx.shavite );
|
||||
simd_2way_init( &c11_4way_ctx.simd, 512 );
|
||||
init_echo( &c11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void c11_4way_hash( void *state, const void *input )
|
||||
int c11_4way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashA[8*2] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
|
||||
c11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
|
||||
c11_4way_context_overlay ctx;
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way_update( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
|
||||
c11_4way_midstate );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
// Serial
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
// 4way
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
// 4 JH
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_4way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
// Serial
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
// 10 Simd
|
||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)hash0, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
(const BitSequence *)hash1, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
||||
(const BitSequence *)hash2, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, 64 );
|
||||
|
||||
#endif
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
__m128i edata[5] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const __m256i four = m256_const1_64( 4 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
mm256_intrlv80_4x64( vdata, edata );
|
||||
|
||||
c11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
|
||||
0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_4way_prehash_le( &blake512_4way_ctx, c11_4way_midstate, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( ( ( hash+(i<<3) )[7] <= Htarg )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
do
|
||||
{
|
||||
if ( likely( c11_4way_hash( hash, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
|
||||
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash + ( lane << 3 ), mythr );
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -3,11 +3,9 @@
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (C11_8WAY)
|
||||
init_c11_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_8way;
|
||||
gate->hash = (void*)&c11_8way_hash;
|
||||
#elif defined (C11_4WAY)
|
||||
init_c11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_4way;
|
||||
gate->hash = (void*)&c11_4way_hash;
|
||||
#else
|
||||
|
@@ -14,14 +14,14 @@
|
||||
bool register_c11_algo( algo_gate_t* gate );
|
||||
#if defined(C11_8WAY)
|
||||
|
||||
void c11_8way_hash( void *state, const void *input );
|
||||
int c11_8way_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_c11_8way_ctx();
|
||||
//void init_c11_8way_ctx();
|
||||
|
||||
#elif defined(C11_4WAY)
|
||||
|
||||
void c11_4way_hash( void *state, const void *input );
|
||||
int c11_4way_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_c11_4way_ctx();
|
||||
|
@@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input)
|
||||
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 3:
|
||||
skein512_4way_update( &ctx.skein, vhashA, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
|
||||
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
|
||||
// skein512_4way_close( &ctx.skein, vhashB );
|
||||
if ( i == 7 )
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
|
||||
break;
|
||||
|
@@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input)
|
||||
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 3:
|
||||
skein512_4way_update( &ctx.skein, vhashA, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
|
||||
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
|
||||
// skein512_4way_close( &ctx.skein, vhashB );
|
||||
if ( i == 9 )
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
|
||||
break;
|
||||
|
@@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input )
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
poly_4way_context_overlay ctx;
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way_update( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
skein512_4way_full( &ctx.skein, vhash, input, 80 );
|
||||
// skein512_4way_init( &ctx.skein );
|
||||
// skein512_4way_update( &ctx.skein, input, 80 );
|
||||
// skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// Need to convert from 64 bit interleaved to 32 bit interleaved.
|
||||
uint32_t vhash32[16*4];
|
||||
|
@@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input )
|
||||
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
|
||||
|
||||
skein512_4way_update( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
// skein512_4way_update( &ctx.skein, input, 80 );
|
||||
// skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, input, 80 );
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
@@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
|
@@ -18,6 +18,7 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/yespower/yespower.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
@@ -31,6 +32,9 @@
|
||||
// Config
|
||||
#define MINOTAUR_ALGO_COUNT 16
|
||||
|
||||
static const yespower_params_t minotaurx_yespower_params =
|
||||
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };
|
||||
|
||||
typedef struct TortureNode TortureNode;
|
||||
typedef struct TortureGarden TortureGarden;
|
||||
|
||||
@@ -59,20 +63,22 @@ struct TortureGarden
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_sha512_context sha512;
|
||||
|
||||
struct TortureNode {
|
||||
struct TortureNode
|
||||
{
|
||||
unsigned int algo;
|
||||
TortureNode *child[2];
|
||||
} nodes[22];
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
|
||||
static void get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
unsigned int algo )
|
||||
static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
unsigned int algo, int thr_id )
|
||||
{
|
||||
unsigned char hash[64] __attribute__ ((aligned (64)));
|
||||
int rc = 1;
|
||||
|
||||
switch (algo) {
|
||||
switch ( algo )
|
||||
{
|
||||
case 0:
|
||||
sph_blake512_init(&garden->blake);
|
||||
sph_blake512(&garden->blake, input, 64);
|
||||
@@ -97,14 +103,14 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
sph_echo512(&garden->echo, input, 64);
|
||||
sph_echo512_close(&garden->echo, hash);
|
||||
#endif
|
||||
break;
|
||||
break;
|
||||
case 4:
|
||||
#if defined(__AES__)
|
||||
fugue512_full( &garden->fugue, hash, input, 64 );
|
||||
#else
|
||||
sph_fugue512_full( &garden->fugue, hash, input, 64 );
|
||||
#endif
|
||||
break;
|
||||
break;
|
||||
case 5:
|
||||
#if defined(__AES__)
|
||||
groestl512_full( &garden->groestl, (char*)hash, (char*)input, 512 );
|
||||
@@ -113,7 +119,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
sph_groestl512(&garden->groestl, input, 64);
|
||||
sph_groestl512_close(&garden->groestl, hash);
|
||||
#endif
|
||||
break;
|
||||
break;
|
||||
case 6:
|
||||
sph_hamsi512_init(&garden->hamsi);
|
||||
sph_hamsi512(&garden->hamsi, input, 64);
|
||||
@@ -164,16 +170,20 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
sph_whirlpool(&garden->whirlpool, input, 64);
|
||||
sph_whirlpool_close(&garden->whirlpool, hash);
|
||||
break;
|
||||
case 16: // minotaurx only, yespower hardcoded for last node
|
||||
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
|
||||
(yespower_binary_t*)hash, thr_id );
|
||||
}
|
||||
|
||||
memcpy(output, hash, 64);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static __thread TortureGarden garden;
|
||||
|
||||
bool initialize_torture_garden()
|
||||
{
|
||||
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
|
||||
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
|
||||
|
||||
garden.nodes[ 0].child[0] = &garden.nodes[ 1];
|
||||
garden.nodes[ 0].child[1] = &garden.nodes[ 2];
|
||||
@@ -219,7 +229,6 @@ bool initialize_torture_garden()
|
||||
garden.nodes[20].child[1] = &garden.nodes[21];
|
||||
garden.nodes[21].child[0] = NULL;
|
||||
garden.nodes[21].child[1] = NULL;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -227,38 +236,45 @@ bool initialize_torture_garden()
|
||||
int minotaur_hash( void *output, const void *input, int thr_id )
|
||||
{
|
||||
unsigned char hash[64] __attribute__ ((aligned (64)));
|
||||
int rc = 1;
|
||||
|
||||
// Find initial sha512 hash
|
||||
sph_sha512_init( &garden.sha512 );
|
||||
sph_sha512( &garden.sha512, input, 80 );
|
||||
sph_sha512_close( &garden.sha512, hash );
|
||||
|
||||
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
|
||||
// if Hamsi is needed but only the first and last functions are
|
||||
// currently known. Abort if either is Hamsi.
|
||||
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|
||||
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
|
||||
return 0;
|
||||
|
||||
if ( opt_algo != ALGO_MINOTAURX )
|
||||
{
|
||||
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
|
||||
// if Hamsi is needed but only the first and last functions are
|
||||
// currently known. Abort if either is Hamsi.
|
||||
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|
||||
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Assign algos to torture garden nodes based on initial hash
|
||||
for ( int i = 0; i < 22; i++ )
|
||||
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;
|
||||
|
||||
// MinotaurX override algo for last node with yespower
|
||||
if ( opt_algo == ALGO_MINOTAURX )
|
||||
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;
|
||||
|
||||
// Send the initial hash through the torture garden
|
||||
TortureNode *node = &garden.nodes[0];
|
||||
|
||||
while ( node )
|
||||
while ( rc && node )
|
||||
{
|
||||
get_hash( hash, hash, &garden, node->algo );
|
||||
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
|
||||
node = node->child[ hash[63] & 1 ];
|
||||
}
|
||||
|
||||
memcpy( output, hash, 32 );
|
||||
return 1;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
@@ -277,7 +293,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
|
||||
edata[19] = n;
|
||||
if ( likely( algo_gate.hash( hash, edata, thr_id ) ) )
|
||||
{
|
||||
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash, mythr );
|
||||
@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hash function has hooks for minotaurx
|
||||
bool register_minotaur_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_minotaur;
|
||||
gate->hash = (void*)&minotaur_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->scanhash = (void*)&scanhash_minotaur;
|
||||
gate->hash = (void*)&minotaur_hash;
|
||||
gate->miner_thread_init = (void*)&initialize_torture_garden;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -163,7 +163,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
size<<3 );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||
}
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
|
@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
char* data;
|
||||
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
|
||||
+ strlen( merkleroot_str ) * 3 );
|
||||
+ strlen( merkleroot_str ) * 3 + 1 );
|
||||
// Build the block header veildatahash in hex
|
||||
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
merkleroot_str, witmerkleroot_str, "04",
|
||||
|
@@ -31,7 +31,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
@@ -85,7 +85,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
|
@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
const __m512i eight = m512_const1_64( 8 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
// convert LE32 to LE64
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
@@ -264,10 +265,8 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
|
||||
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
@@ -279,7 +278,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
@@ -291,8 +290,6 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#elif defined(X17_4WAY)
|
||||
|
||||
union _x17_4way_context_overlay
|
||||
@@ -322,6 +319,9 @@ union _x17_4way_context_overlay
|
||||
};
|
||||
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
|
||||
|
||||
static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
|
||||
|
||||
int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
@@ -333,7 +333,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||
x17_4way_context_overlay ctx;
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
|
||||
x17_4way_midstate );
|
||||
|
||||
// blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
@@ -449,4 +452,55 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Scan a range of nonces for x17, hashing 4 nonces per call to x17_4way_hash().
//
// work        - supplies the header words (work->data) and target (work->target).
// max_nonce   - scanning stops once n reaches max_nonce - 4.
// hashes_done - out: number of nonces covered by this call (n - first_nonce).
// mythr       - thread info; mythr->id selects the work_restart[] entry to poll.
//
// Scanning starts at pdata[19]. A lane whose 32-bit word at hash32_d7[lane] is
// <= ptarget[7] is extracted, re-checked with valid_hash() and, if it passes,
// handed to submit_solution() with pdata[19] set to that lane's nonce. Nothing
// is submitted when opt_benchmark is set. On exit pdata[19] holds the next
// unscanned nonce. Always returns 0.
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i edata[5] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
// the 4 words starting at hash32[7*4], indexed by lane in the loop below
// (presumably word 7 of each lane in a 4x32 interleaved layout - confirm)
uint32_t *hash32_d7 = &(hash32[7*4]);
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
// upper bound for n in the loop condition below
const uint32_t last_nonce = max_nonce - 4;
|
||||
// vdata is 320 bytes, i.e. ten 256-bit vectors; this is the last one
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
// NOTE(review): presumably broadcasts 4 to every 64-bit element - confirm
const __m256i four = m256_const1_64( 4 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
// convert LE32 to LE64
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm256_intrlv80_4x64( vdata, edata );
|
||||
// add 0,1,2,3 to the low 32 bits of 64-bit elements 0..3 so that each lane
// gets a distinct value (lane k = starting value + k)
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
|
||||
// NOTE(review): appears to precompute nonce-independent blake512 state into
// x17_4way_midstate for use by x17_4way_hash() - confirm in blake512 code
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
|
||||
|
||||
do
|
||||
{
|
||||
// lanes are only examined when the hash function returns nonzero
if ( likely( x17_4way_hash( hash32, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
// single-word pre-filter before extracting the full lane hash;
// never true in benchmark mode
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
// step the vector in lockstep with the scalar counter n
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -6,7 +6,8 @@ bool register_x17_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x17_8way;
|
||||
gate->hash = (void*)&x17_8way_hash;
|
||||
#elif defined (X17_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
||||
gate->scanhash = (void*)&scanhash_x17_4way;
|
||||
// gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
||||
gate->hash = (void*)&x17_4way_hash;
|
||||
#else
|
||||
gate->hash = (void*)&x17_hash;
|
||||
|
@@ -581,10 +581,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
|
||||
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
@@ -941,9 +939,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm256_intrlv80_4x64( vdata, edata );
|
||||
|
||||
*noncev = mm256_intrlv_blend_32( *noncev,
|
||||
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
|
||||
0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
|
||||
|
||||
do
|
||||
|
File diff suppressed because it is too large
Load Diff
10
api.c
10
api.c
@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
|
||||
char inpkey[128] = { 0 };
|
||||
char seckey[64];
|
||||
uchar sha1[20];
|
||||
SHA_CTX ctx;
|
||||
// SHA_CTX ctx;
|
||||
|
||||
if (opt_protocol)
|
||||
applog(LOG_DEBUG, "clientkey: %s", clientkey);
|
||||
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
|
||||
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
|
||||
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
|
||||
|
||||
SHA1_Init(&ctx);
|
||||
SHA1_Update(&ctx, inpkey, strlen(inpkey));
|
||||
SHA1_Final(sha1, &ctx);
|
||||
SHA1( inpkey, strlen(inpkey), sha1 );
|
||||
// Deprecated in openssl-3
|
||||
// SHA1_Init(&ctx);
|
||||
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
|
||||
// SHA1_Final(sha1, &ctx);
|
||||
|
||||
base64_encode(sha1, 20, seckey, sizeof(seckey));
|
||||
|
||||
|
@@ -4,18 +4,48 @@
|
||||
# during development. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
|
||||
|
||||
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl
|
||||
# Rocketlake needs gcc-11
|
||||
#CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512-sha-vaes
|
||||
|
||||
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#./autogen.sh || echo done
|
||||
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-alderlake
|
||||
|
||||
# Zen4 AVX512 SHA VAES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# znver3 needs gcc-11, znver4 ?
|
||||
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
|
||||
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen4
|
||||
|
||||
# Zen3 AVX2 SHA VAES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
|
||||
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen3
|
||||
|
||||
# AVX512 AES: Intel Core HEDT Skylake-X, Cascadelake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
@@ -59,7 +89,7 @@ make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx
|
||||
|
||||
# SSE4.2 AES: Intel Westmere
|
||||
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
|
||||
|
@@ -2,8 +2,8 @@
|
||||
#
|
||||
# make clean and rm all the targeted executables.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe > /dev/null
|
||||
|
||||
make distclean > /dev/null
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.20.2])
|
||||
AC_INIT([cpuminer-opt], [3.22.0])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
444
cpu-miner.c
444
cpu-miner.c
@@ -37,6 +37,7 @@
|
||||
#include <curl/curl.h>
|
||||
#include <jansson.h>
|
||||
#include <openssl/sha.h>
|
||||
//#include <mm_malloc.h>
|
||||
#include "sysinfos.c"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
@@ -131,10 +132,9 @@ bool opt_verify = false;
|
||||
static bool opt_stratum_keepalive = false;
|
||||
static struct timeval stratum_keepalive_timer;
|
||||
// Stratum typically times out in 5 minutes or 300 seconds
|
||||
#define stratum_keepalive_timeout 180 // 3 minutes
|
||||
#define stratum_keepalive_timeout 150 // 2.5 minutes
|
||||
static struct timeval stratum_reset_time;
|
||||
|
||||
|
||||
// pk_buffer_size is used as a version selector by b58 code, therefore
|
||||
// it must be set correctly to work.
|
||||
const int pk_buffer_size_max = 26;
|
||||
@@ -318,8 +318,9 @@ static void affine_to_cpu( struct thr_info *thr )
|
||||
if ( !ok )
|
||||
{
|
||||
last_error = GetLastError();
|
||||
applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
|
||||
thread, last_error );
|
||||
if ( !thread )
|
||||
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
|
||||
last_error, thread );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,11 +391,11 @@ bool std_le_work_decode( struct work *work )
|
||||
{
|
||||
int i;
|
||||
const int adata_sz = algo_gate.get_work_data_size() / 4;
|
||||
const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
// const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
|
||||
for ( i = 0; i < adata_sz; i++ )
|
||||
work->data[i] = le32dec( work->data + i );
|
||||
for ( i = 0; i < atarget_sz; i++ )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
work->target[i] = le32dec( work->target + i );
|
||||
return true;
|
||||
}
|
||||
@@ -403,11 +404,11 @@ bool std_be_work_decode( struct work *work )
|
||||
{
|
||||
int i;
|
||||
const int adata_sz = algo_gate.get_work_data_size() / 4;
|
||||
const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
// const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
|
||||
for ( i = 0; i < adata_sz; i++ )
|
||||
work->data[i] = be32dec( work->data + i );
|
||||
for ( i = 0; i < atarget_sz; i++ )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
work->target[i] = le32dec( work->target + i );
|
||||
return true;
|
||||
}
|
||||
@@ -431,20 +432,18 @@ static bool work_decode( const json_t *val, struct work *work )
|
||||
if ( unlikely( !algo_gate.work_decode( work ) ) )
|
||||
return false;
|
||||
|
||||
if ( !allow_mininginfo )
|
||||
net_diff = algo_gate.calc_network_diff( work );
|
||||
else
|
||||
net_diff = hash_to_diff( work->target );
|
||||
|
||||
work->targetdiff = net_diff;
|
||||
stratum_diff = last_targetdiff = work->targetdiff;
|
||||
// many of these aren't used solo.
|
||||
net_diff =
|
||||
work->targetdiff =
|
||||
stratum_diff =
|
||||
last_targetdiff = hash_to_diff( work->target );
|
||||
work->sharediff = 0;
|
||||
algo_gate.decode_extra_data( work, &net_blocks );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// good alternative for wallet mining, difficulty and net hashrate
|
||||
// Only used for net_hashrate with GBT/getwork, data is from previous block.
|
||||
static const char *info_req =
|
||||
"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";
|
||||
|
||||
@@ -470,17 +469,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
|
||||
// "networkhashps": 56475980
|
||||
if ( res )
|
||||
{
|
||||
// net_diff is a global that is set from the work hash target by
|
||||
// both getwork and GBT. Don't overwrite it, define a local to override
|
||||
// the global.
|
||||
double net_diff = 0.;
|
||||
double difficulty = 0.;
|
||||
json_t *key = json_object_get( res, "difficulty" );
|
||||
if ( key )
|
||||
{
|
||||
if ( json_is_object( key ) )
|
||||
key = json_object_get( key, "proof-of-work" );
|
||||
if ( json_is_real( key ) )
|
||||
net_diff = json_real_value( key );
|
||||
difficulty = json_real_value( key );
|
||||
}
|
||||
|
||||
key = json_object_get( res, "networkhashps" );
|
||||
@@ -497,12 +493,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
|
||||
net_blocks = json_integer_value( key );
|
||||
|
||||
if ( opt_debug )
|
||||
applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
|
||||
net_diff, net_hashrate, net_blocks );
|
||||
|
||||
applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
|
||||
|
||||
if ( !work->height )
|
||||
{
|
||||
// complete missing data from getwork
|
||||
if ( opt_debug )
|
||||
applog( LOG_DEBUG, "work height set by getmininginfo" );
|
||||
work->height = (uint32_t) net_blocks + 1;
|
||||
if ( work->height > g_work.height )
|
||||
restart_threads();
|
||||
@@ -518,11 +515,10 @@ static bool get_mininginfo( CURL *curl, struct work *work )
|
||||
|
||||
static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
{
|
||||
int i, n;
|
||||
uint32_t prevhash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t target[8] __attribute__ ((aligned (32)));
|
||||
unsigned char final_sapling_hash[32] __attribute__ ((aligned (32)));
|
||||
uint32_t version, curtime, bits;
|
||||
uint32_t prevhash[8];
|
||||
uint32_t target[8];
|
||||
unsigned char final_sapling_hash[32];
|
||||
int cbtx_size;
|
||||
uchar *cbtx = NULL;
|
||||
int tx_count, tx_size;
|
||||
@@ -534,9 +530,9 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
bool version_reduce = false;
|
||||
json_t *tmp, *txa;
|
||||
bool rc = false;
|
||||
|
||||
// Segwit BEGIN
|
||||
int i, n;
|
||||
bool segwit = false;
|
||||
|
||||
tmp = json_object_get( val, "rules" );
|
||||
if ( tmp && json_is_array( tmp ) )
|
||||
{
|
||||
@@ -554,8 +550,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
}
|
||||
}
|
||||
}
|
||||
// Segwit END
|
||||
|
||||
|
||||
tmp = json_object_get( val, "mutable" );
|
||||
if ( tmp && json_is_array( tmp ) )
|
||||
{
|
||||
@@ -637,7 +632,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* find count and size of transactions */
|
||||
txa = json_object_get(val, "transactions" );
|
||||
if ( !txa || !json_is_array( txa ) )
|
||||
@@ -712,12 +707,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
cbtx[41] = cbtx_size - 42; /* scriptsig length */
|
||||
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
|
||||
cbtx_size += 4;
|
||||
|
||||
// Segwit BEGIN
|
||||
//cbtx[cbtx_size++] = 1; /* out-counter */
|
||||
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
|
||||
// Segwit END
|
||||
|
||||
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
|
||||
le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
|
||||
le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
|
||||
cbtx_size += 8;
|
||||
@@ -725,7 +715,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
|
||||
cbtx_size += (int) pk_script_size;
|
||||
|
||||
// Segwit BEGIN
|
||||
if ( segwit )
|
||||
{
|
||||
unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
|
||||
@@ -760,12 +749,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
for ( i = 0; i < n; i++ )
|
||||
sha256d( wtree[i], wtree[2*i], 64 );
|
||||
}
|
||||
memset( wtree[1], 0, 32 ); /* witness reserved value = 0 */
|
||||
memset( wtree[1], 0, 32 ); // witness reserved value = 0
|
||||
sha256d( cbtx+cbtx_size, wtree[0], 64 );
|
||||
cbtx_size += 32;
|
||||
free( wtree );
|
||||
}
|
||||
// Segwit END
|
||||
|
||||
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
|
||||
cbtx_size += 4;
|
||||
@@ -784,10 +772,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
xsig_len += n;
|
||||
}
|
||||
else
|
||||
{
|
||||
applog( LOG_WARNING,
|
||||
"Signature does not fit in coinbase, skipping" );
|
||||
}
|
||||
}
|
||||
tmp = json_object_get( val, "coinbaseaux" );
|
||||
if ( tmp && json_is_object( tmp ) )
|
||||
@@ -814,8 +800,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
if ( xsig_len )
|
||||
{
|
||||
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
|
||||
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
|
||||
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
|
||||
int push_len = cbtx[41] + xsig_len < 76
|
||||
? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
|
||||
n = xsig_len + push_len;
|
||||
memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
|
||||
cbtx[41] += n;
|
||||
@@ -842,7 +828,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
|
||||
const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;
|
||||
|
||||
// Segwit BEGIN
|
||||
if ( segwit )
|
||||
{
|
||||
const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
|
||||
@@ -855,8 +840,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
}
|
||||
else
|
||||
{
|
||||
// Segwit END
|
||||
|
||||
unsigned char *tx = (uchar*) malloc( tx_size );
|
||||
if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
|
||||
{
|
||||
@@ -866,10 +849,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
}
|
||||
sha256d( merkle_tree[1 + i], tx, tx_size );
|
||||
free( tx );
|
||||
|
||||
// Segwit BEGIN
|
||||
}
|
||||
// Segwit END
|
||||
|
||||
if ( !submit_coinbase )
|
||||
strcat( work->txs, tx_hex );
|
||||
@@ -898,10 +878,12 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
applog( LOG_ERR, "JSON invalid target" );
|
||||
goto out;
|
||||
}
|
||||
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
|
||||
work->target[7 - i] = be32dec( target + i );
|
||||
|
||||
// reverse the bytes in target
|
||||
casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
|
||||
casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
|
||||
net_diff = work->targetdiff = hash_to_diff( work->target );
|
||||
|
||||
|
||||
tmp = json_object_get( val, "workid" );
|
||||
if ( tmp )
|
||||
{
|
||||
@@ -1077,12 +1059,11 @@ void report_summary_log( bool force )
|
||||
timeval_subtract( &et, &now, &start_time );
|
||||
timeval_subtract( &uptime, &total_hashes_time, &session_start );
|
||||
|
||||
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
|
||||
double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
|
||||
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
|
||||
double target_diff = exp32 * last_targetdiff;
|
||||
double shrate = safe_div( target_diff * (double)(accepts),
|
||||
share_time, 0. );
|
||||
// global_hashrate = ghrate;
|
||||
double sess_hrate = safe_div( exp32 * norm_diff_sum,
|
||||
(double)uptime.tv_sec, 0. );
|
||||
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
|
||||
@@ -1103,7 +1084,7 @@ void report_summary_log( bool force )
|
||||
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
|
||||
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
|
||||
submit_rate, safe_div( (double)submitted_share_count*60.,
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
|
||||
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
|
||||
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
|
||||
|
||||
@@ -1459,6 +1440,7 @@ char* std_malloc_txs_request( struct work *work )
|
||||
json_t *val;
|
||||
char data_str[2 * sizeof(work->data) + 1];
|
||||
int i;
|
||||
// datasize is an ugly hack, it should go through the gate
|
||||
int datasize = work->sapling ? 112 : 80;
|
||||
|
||||
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
|
||||
@@ -1549,7 +1531,6 @@ const char *getwork_req =
|
||||
|
||||
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
|
||||
|
||||
// Segwit BEGIN
|
||||
#define GBT_RULES "[\"segwit\"]"
|
||||
static const char *gbt_req =
|
||||
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
|
||||
@@ -1558,16 +1539,6 @@ const char *gbt_lp_req =
|
||||
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
|
||||
GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
|
||||
|
||||
/*
|
||||
static const char *gbt_req =
|
||||
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
|
||||
GBT_CAPABILITIES "}], \"id\":0}\r\n";
|
||||
const char *gbt_lp_req =
|
||||
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
|
||||
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
|
||||
*/
|
||||
// Segwit END
|
||||
|
||||
static bool get_upstream_work( CURL *curl, struct work *work )
|
||||
{
|
||||
json_t *val;
|
||||
@@ -1645,46 +1616,46 @@ start:
|
||||
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, net_diff,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
double miner_hr = 0.;
|
||||
double net_hr = net_hashrate;
|
||||
double nd = net_diff * exp32;
|
||||
char net_hr_units[4] = {0};
|
||||
char miner_hr_units[4] = {0};
|
||||
char net_ttf[32];
|
||||
char miner_ttf[32];
|
||||
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
for ( int i = 0; i < opt_n_threads; i++ )
|
||||
miner_hr += thr_hashrates[i];
|
||||
global_hashrate = miner_hr;
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( net_hr > 0. )
|
||||
sprintf_et( net_ttf, nd / net_hr );
|
||||
else
|
||||
sprintf( net_ttf, "NA" );
|
||||
if ( miner_hr > 0. )
|
||||
sprintf_et( miner_ttf, nd / miner_hr );
|
||||
else
|
||||
sprintf( miner_ttf, "NA" );
|
||||
|
||||
scale_hash_for_display ( &miner_hr, miner_hr_units );
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO,
|
||||
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
|
||||
miner_hr, miner_hr_units, miner_ttf, net_hr,
|
||||
net_hr_units, net_ttf );
|
||||
}
|
||||
} // work->height > last_block_height
|
||||
}
|
||||
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, net_diff,
|
||||
work->height, net_diff,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
double miner_hr = 0.;
|
||||
double net_hr = net_hashrate;
|
||||
double nd = net_diff * exp32;
|
||||
char net_hr_units[4] = {0};
|
||||
char miner_hr_units[4] = {0};
|
||||
char net_ttf[32];
|
||||
char miner_ttf[32];
|
||||
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
for ( int i = 0; i < opt_n_threads; i++ )
|
||||
miner_hr += thr_hashrates[i];
|
||||
global_hashrate = miner_hr;
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( net_hr > 0. )
|
||||
sprintf_et( net_ttf, nd / net_hr );
|
||||
else
|
||||
sprintf( net_ttf, "NA" );
|
||||
if ( miner_hr > 0. )
|
||||
sprintf_et( miner_ttf, nd / miner_hr );
|
||||
else
|
||||
sprintf( miner_ttf, "NA" );
|
||||
|
||||
scale_hash_for_display ( &miner_hr, miner_hr_units );
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO,
|
||||
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
|
||||
miner_hr, miner_hr_units, miner_ttf, net_hr,
|
||||
net_hr_units, net_ttf );
|
||||
}
|
||||
} // rc
|
||||
|
||||
return rc;
|
||||
@@ -1710,36 +1681,36 @@ static void workio_cmd_free(struct workio_cmd *wc)
|
||||
|
||||
static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
|
||||
{
|
||||
struct work *ret_work;
|
||||
struct work *work_heap;
|
||||
int failures = 0;
|
||||
|
||||
ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
|
||||
if ( !ret_work )
|
||||
return false;
|
||||
work_heap = calloc( 1, sizeof(struct work) );
|
||||
if ( !work_heap ) return false;
|
||||
|
||||
/* obtain new work from bitcoin via JSON-RPC */
|
||||
while ( !get_upstream_work( curl, ret_work ) )
|
||||
while ( !get_upstream_work( curl, work_heap ) )
|
||||
{
|
||||
if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
|
||||
{
|
||||
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
|
||||
free( ret_work );
|
||||
return false;
|
||||
free( work_heap );
|
||||
return false;
|
||||
}
|
||||
|
||||
/* pause, then restart work-request loop */
|
||||
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
|
||||
opt_fail_pause );
|
||||
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
|
||||
opt_fail_pause );
|
||||
sleep( opt_fail_pause );
|
||||
}
|
||||
|
||||
/* send work to requesting thread */
|
||||
if ( !tq_push(wc->thr->q, ret_work ) )
|
||||
free( ret_work );
|
||||
if ( !tq_push(wc->thr->q, work_heap ) )
|
||||
free( work_heap );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
|
||||
{
|
||||
int failures = 0;
|
||||
@@ -1810,7 +1781,7 @@ static void *workio_thread(void *userdata)
|
||||
static bool get_work(struct thr_info *thr, struct work *work)
|
||||
{
|
||||
struct workio_cmd *wc;
|
||||
struct work *work_heap;
|
||||
struct work *work_heap;
|
||||
|
||||
if unlikely( opt_benchmark )
|
||||
{
|
||||
@@ -1835,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
|
||||
wc->thr = thr;
|
||||
/* send work request to workio thread */
|
||||
if (!tq_push(thr_info[work_thr_id].q, wc))
|
||||
{
|
||||
{
|
||||
workio_cmd_free(wc);
|
||||
return false;
|
||||
}
|
||||
/* wait for response, a unit of work */
|
||||
work_heap = (struct work*) tq_pop(thr->q, NULL);
|
||||
if (!work_heap)
|
||||
return false;
|
||||
/* copy returned work into storage provided by caller */
|
||||
memcpy(work, work_heap, sizeof(*work));
|
||||
free(work_heap);
|
||||
if ( !work_heap ) return false;
|
||||
/* copy returned work into storage provided by caller */
|
||||
memcpy( work, work_heap, sizeof(*work) );
|
||||
free( work_heap );
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1895,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
|
||||
bool submit_solution( struct work *work, const void *hash,
|
||||
struct thr_info *thr )
|
||||
{
|
||||
// Job went stale during hashing of a valid share.
|
||||
if ( !opt_quiet && work_restart[ thr->id ].restart )
|
||||
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
|
||||
// Job went stale during hashing of a valid share.
|
||||
// if ( !opt_quiet && work_restart[ thr->id ].restart )
|
||||
// applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
|
||||
|
||||
work->sharediff = hash_to_diff( hash );
|
||||
if ( likely( submit_work( thr, work ) ) )
|
||||
@@ -1915,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
if ( have_stratum )
|
||||
{
|
||||
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
|
||||
submitted_share_count, work->sharediff, work->height,
|
||||
work->job_id );
|
||||
if ( opt_debug && opt_extranonce )
|
||||
{
|
||||
unsigned char *xnonce2str = abin2hex( work->xnonce2,
|
||||
work->xnonce2_len );
|
||||
applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
|
||||
free( xnonce2str );
|
||||
}
|
||||
}
|
||||
else
|
||||
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
|
||||
submitted_share_count, work->sharediff, work->height,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
}
|
||||
|
||||
if ( opt_debug )
|
||||
{
|
||||
uint32_t* h = (uint32_t*)hash;
|
||||
uint32_t* t = (uint32_t*)work->target;
|
||||
uint32_t* d = (uint32_t*)work->data;
|
||||
if ( opt_debug )
|
||||
{
|
||||
uint32_t* h = (uint32_t*)hash;
|
||||
uint32_t* t = (uint32_t*)work->target;
|
||||
uint32_t* d = (uint32_t*)work->data;
|
||||
|
||||
unsigned char *xnonce2str = abin2hex( work->xnonce2,
|
||||
work->xnonce2_len );
|
||||
applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
|
||||
work->data[ algo_gate.nonce_index ], xnonce2str );
|
||||
free( xnonce2str );
|
||||
applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
|
||||
applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
|
||||
applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
|
||||
applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
|
||||
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
|
||||
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
|
||||
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
|
||||
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -1958,15 +1930,15 @@ static bool wanna_mine(int thr_id)
|
||||
float temp = cpu_temp(0);
|
||||
if (temp > opt_max_temp)
|
||||
{
|
||||
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
|
||||
applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
|
||||
state = false;
|
||||
if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
|
||||
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
|
||||
state = false;
|
||||
}
|
||||
}
|
||||
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
|
||||
{
|
||||
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
|
||||
applog(LOG_INFO, "network diff too high, waiting...");
|
||||
applog(LOG_NOTICE, "network diff too high, waiting...");
|
||||
state = false;
|
||||
}
|
||||
if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
|
||||
@@ -1975,12 +1947,14 @@ static bool wanna_mine(int thr_id)
|
||||
{
|
||||
char rate[32];
|
||||
format_hashrate(opt_max_rate, rate);
|
||||
applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
|
||||
applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
|
||||
}
|
||||
state = false;
|
||||
}
|
||||
if (thr_id < MAX_CPUS)
|
||||
conditional_state[thr_id] = (uint8_t) !state;
|
||||
|
||||
if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
|
||||
applog(LOG_NOTICE, "...resuming" );
|
||||
conditional_state[thr_id] = (uint8_t) !state;
|
||||
return state;
|
||||
}
|
||||
|
||||
@@ -2014,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
|
||||
be32enc( work->data + i, work->data[i] );
|
||||
}
|
||||
|
||||
// calculate net diff from nbits.
|
||||
double std_calc_network_diff( struct work* work )
|
||||
{
|
||||
uint32_t nbits = work->data[ algo_gate.nbits_index ];
|
||||
uint32_t shift = nbits & 0xff;
|
||||
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
|
||||
/*
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
int nbits_index = algo_gate.nbits_index;
|
||||
uint32_t nbits = have_longpoll ? work->data[ nbits_index]
|
||||
: swab32( work->data[ nbits_index ] );
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
*/
|
||||
|
||||
int m;
|
||||
long double d = (long double)0x0000ffff / (long double)bits;
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
for ( m = 29; m < shift; m++ )
|
||||
d /= 256.0;
|
||||
if ( opt_debug_diff )
|
||||
applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
|
||||
return (double)d;
|
||||
}
|
||||
|
||||
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t *end_nonce_ptr )
|
||||
{
|
||||
@@ -2064,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
++(*nonceptr);
|
||||
}
|
||||
|
||||
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && !work->data[0] && !opt_benchmark )
|
||||
{
|
||||
sleep(1);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
{
|
||||
bool new_job;
|
||||
@@ -2091,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
|
||||
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
|
||||
algo_gate.build_extraheader( g_work, sctx );
|
||||
net_diff = algo_gate.calc_network_diff( g_work );
|
||||
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
|
||||
algo_gate.set_work_data_endian( g_work );
|
||||
g_work->height = sctx->block_height;
|
||||
g_work->targetdiff = sctx->job.diff
|
||||
@@ -2141,8 +2077,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
if ( ( stratum_diff != sctx->job.diff )
|
||||
|| ( last_block_height != sctx->block_height ) )
|
||||
{
|
||||
static bool multipool = false;
|
||||
if ( stratum.block_height < last_block_height ) multipool = true;
|
||||
if ( unlikely( !session_first_block ) )
|
||||
session_first_block = stratum.block_height;
|
||||
last_block_height = stratum.block_height;
|
||||
@@ -2150,56 +2084,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
last_targetdiff = g_work->targetdiff;
|
||||
if ( lowest_share < last_targetdiff )
|
||||
lowest_share = 9e99;
|
||||
}
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
|
||||
net_diff, stratum_diff, g_work->targetdiff );
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
|
||||
net_diff, stratum_diff, g_work->targetdiff );
|
||||
|
||||
if ( likely( hr > 0. ) )
|
||||
{
|
||||
double nd = net_diff * exp32;
|
||||
char hr_units[4] = {0};
|
||||
char block_ttf[32];
|
||||
char share_ttf[32];
|
||||
if ( likely( hr > 0. ) )
|
||||
{
|
||||
double nd = net_diff * exp32;
|
||||
char hr_units[4] = {0};
|
||||
char block_ttf[32];
|
||||
char share_ttf[32];
|
||||
static bool multipool = false;
|
||||
|
||||
if ( stratum.block_height < last_block_height ) multipool = true;
|
||||
|
||||
sprintf_et( block_ttf, nd / hr );
|
||||
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
|
||||
scale_hash_for_display ( &hr, hr_units );
|
||||
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
|
||||
hr, hr_units, block_ttf, share_ttf );
|
||||
|
||||
sprintf_et( block_ttf, nd / hr );
|
||||
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
|
||||
scale_hash_for_display ( &hr, hr_units );
|
||||
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
|
||||
hr, hr_units, block_ttf, share_ttf );
|
||||
|
||||
if ( !multipool && last_block_height > session_first_block )
|
||||
{
|
||||
struct timeval now, et;
|
||||
gettimeofday( &now, NULL );
|
||||
timeval_subtract( &et, &now, &session_start );
|
||||
uint64_t net_ttf =
|
||||
( last_block_height - session_first_block ) == 0 ? 0
|
||||
: et.tv_sec / ( last_block_height - session_first_block );
|
||||
if ( net_diff > 0. && net_ttf )
|
||||
{
|
||||
double net_hr = nd / net_ttf;
|
||||
char net_hr_units[4] = {0};
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
|
||||
net_hr, net_hr_units );
|
||||
}
|
||||
}
|
||||
} // hr > 0
|
||||
} // !quiet
|
||||
} // new diff/block
|
||||
|
||||
if ( new_job && !( opt_quiet || stratum_errors ) )
|
||||
{
|
||||
int mismatch = submitted_share_count - ( accepted_share_count
|
||||
+ stale_share_count
|
||||
+ rejected_share_count );
|
||||
if ( mismatch )
|
||||
applog( LOG_INFO,
|
||||
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
|
||||
submitted_share_count );
|
||||
}
|
||||
if ( !multipool && last_block_height > session_first_block )
|
||||
{
|
||||
struct timeval now, et;
|
||||
gettimeofday( &now, NULL );
|
||||
timeval_subtract( &et, &now, &session_start );
|
||||
uint64_t net_ttf = safe_div( et.tv_sec,
|
||||
last_block_height - session_first_block, 0 );
|
||||
if ( net_diff > 0. && net_ttf )
|
||||
{
|
||||
double net_hr = safe_div( nd, net_ttf, 0. );
|
||||
char net_hr_units[4] = {0};
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
|
||||
net_hr, net_hr_units );
|
||||
}
|
||||
}
|
||||
} // hr > 0
|
||||
} // !quiet
|
||||
}
|
||||
|
||||
static void *miner_thread( void *userdata )
|
||||
@@ -2337,9 +2262,14 @@ static void *miner_thread( void *userdata )
|
||||
} // do_this_thread
|
||||
algo_gate.resync_threads( thr_id, &work );
|
||||
|
||||
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
|
||||
// conditional mining
|
||||
if ( unlikely( !wanna_mine( thr_id ) ) )
|
||||
{
|
||||
restart_threads();
|
||||
sleep(5);
|
||||
continue;
|
||||
|
||||
}
|
||||
|
||||
// opt_scantime expressed in hashes
|
||||
max64 = opt_scantime * thr_hashrates[thr_id];
|
||||
|
||||
@@ -2444,8 +2374,8 @@ static void *miner_thread( void *userdata )
|
||||
{
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
applog( LOG_INFO, "CPU #%d: %s %sh/s",
|
||||
thr_id, hr, hr_units );
|
||||
applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s",
|
||||
thr_id, thread_affinity_map[ thr_id ], hr, hr_units );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2486,14 +2416,6 @@ static void *miner_thread( void *userdata )
|
||||
}
|
||||
}
|
||||
} // benchmark
|
||||
|
||||
// conditional mining
|
||||
if ( unlikely( !wanna_mine( thr_id ) ) )
|
||||
{
|
||||
sleep(5);
|
||||
continue;
|
||||
}
|
||||
|
||||
} // miner_thread loop
|
||||
|
||||
out:
|
||||
@@ -2885,7 +2807,7 @@ static void *stratum_thread(void *userdata )
|
||||
else
|
||||
timeval_subtract( &et, &now, &stratum_reset_time );
|
||||
|
||||
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
|
||||
if ( et.tv_sec > stratum_keepalive_timeout + 90 )
|
||||
{
|
||||
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
|
||||
stratum_need_reset = true;
|
||||
@@ -3668,7 +3590,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
#if defined(WIN32)
|
||||
|
||||
// Are Windows CPU Groups supported?
|
||||
// Get the number of cpus, display after parsing command line
|
||||
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
|
||||
num_cpus = 0;
|
||||
num_cpugroups = GetActiveProcessorGroupCount();
|
||||
@@ -3677,8 +3599,8 @@ int main(int argc, char *argv[])
|
||||
int cpus = GetActiveProcessorCount( i );
|
||||
num_cpus += cpus;
|
||||
|
||||
if (opt_debug)
|
||||
applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
|
||||
// if (opt_debug)
|
||||
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -3695,7 +3617,7 @@ int main(int argc, char *argv[])
|
||||
sysctl(req, 2, &num_cpus, &len, NULL, 0);
|
||||
#else
|
||||
num_cpus = 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if ( num_cpus < 1 )
|
||||
num_cpus = 1;
|
||||
@@ -3719,7 +3641,6 @@ int main(int argc, char *argv[])
|
||||
if ( opt_time_limit )
|
||||
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
|
||||
|
||||
|
||||
// need to register to get algo optimizations for cpu capabilities
|
||||
// but that causes registration logs before cpu capabilities is output.
|
||||
// Would need to split register function into 2 parts. First part sets algo
|
||||
@@ -3847,6 +3768,11 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
|
||||
#endif
|
||||
|
||||
if ( opt_affinity && num_cpus > max_cpus )
|
||||
{
|
||||
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
|
||||
@@ -3858,7 +3784,7 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
|
||||
{
|
||||
while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
|
||||
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
|
||||
thread_affinity_map[ thr ] = cpu % num_cpus;
|
||||
}
|
||||
if ( !opt_quiet )
|
||||
@@ -3992,7 +3918,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize stats times and counters
|
||||
// Initialize stats timers and counters
|
||||
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
|
31
miner.h
31
miner.h
@@ -91,6 +91,19 @@ enum {
|
||||
LOG_PINK = 0x14 };
|
||||
#endif
|
||||
|
||||
#define WORK_ALIGNMENT 64
|
||||
|
||||
// When working with dynamically allocated memory to guarantee data alignment
|
||||
// for large vectors. Physical block size must be extended by alignment number
|
||||
// of bytes when allocated. free() should use the physical pointer returned by
|
||||
// malloc(), not the aligned pointer. All others should use the logical,
|
||||
// aligned, pointer returned by this function.
|
||||
// Round ptr up to the next multiple of alignment and return the result as
// the aligned pointer. A pointer that is already aligned is returned
// unchanged.
//
// ptr:       address to align; it is never dereferenced here.
// alignment: must be a non-zero power of 2, the mask arithmetic below is
//            only valid in that case.
//
// The result can be up to alignment-1 bytes past ptr, so the underlying
// allocation has to be large enough to absorb that offset.
static inline void *align_ptr( const void *ptr, const uint64_t alignment )
{
   // uintptr_t is the integer type meant for pointer round trips. A fixed
   // 64 bit cast is the wrong width where pointers are not 64 bits wide.
   const uintptr_t mask = (uintptr_t)alignment - 1;
   return (void*)( ( (uintptr_t)ptr + mask ) & ~mask );
}
|
||||
|
||||
extern bool is_power_of_2( int n );
|
||||
|
||||
static inline bool is_windows(void)
|
||||
@@ -118,7 +131,7 @@ static inline bool is_windows(void)
|
||||
static inline uint32_t swab32(uint32_t v)
|
||||
{
|
||||
#ifdef WANT_BUILTIN_BSWAP
|
||||
return __builtin_bswap32(v);
|
||||
return __builtin_bswap32(v);
|
||||
#else
|
||||
return bswap_32(v);
|
||||
#endif
|
||||
@@ -317,7 +330,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
|
||||
void bin2hex( char *s, const unsigned char *p, size_t len );
|
||||
char *abin2hex( const unsigned char *p, size_t len );
|
||||
char *bebin2hex( const unsigned char *p, size_t len );
|
||||
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
|
||||
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len );
|
||||
bool jobj_binary( const json_t *obj, const char *key, void *buf,
|
||||
size_t buflen );
|
||||
int varint_encode( unsigned char *p, uint64_t n );
|
||||
@@ -333,10 +346,7 @@ extern void memrev(unsigned char *p, size_t len);
|
||||
// number of hashes.
|
||||
//
|
||||
// https://en.bitcoin.it/wiki/Difficulty
|
||||
//
|
||||
// hash = diff * 2**32
|
||||
//
|
||||
// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;
|
||||
|
||||
#define EXP16 65536.
|
||||
#define EXP32 4294967296.
|
||||
@@ -350,8 +360,9 @@ extern const long double exp160; // 2**160
|
||||
bool fulltest( const uint32_t *hash, const uint32_t *target );
|
||||
bool valid_hash( const void*, const void* );
|
||||
|
||||
double hash_to_diff( const void* );
|
||||
extern double hash_to_diff( const void* );
|
||||
extern void diff_to_hash( uint32_t*, const double );
|
||||
extern double nbits_to_diff( uint32_t );
|
||||
|
||||
double hash_target_ratio( uint32_t* hash, uint32_t* target );
|
||||
void work_set_target_ratio( struct work* work, const void *hash );
|
||||
@@ -405,7 +416,7 @@ struct work
|
||||
unsigned char *xnonce2;
|
||||
bool sapling;
|
||||
bool stale;
|
||||
} __attribute__ ((aligned (64)));
|
||||
} __attribute__ ((aligned (WORK_ALIGNMENT)));
|
||||
|
||||
struct stratum_job
|
||||
{
|
||||
@@ -540,7 +551,6 @@ enum algos {
|
||||
ALGO_BMW,
|
||||
ALGO_BMW512,
|
||||
ALGO_C11,
|
||||
ALGO_DECRED,
|
||||
ALGO_DEEP,
|
||||
ALGO_DMD_GR,
|
||||
ALGO_GROESTL,
|
||||
@@ -559,6 +569,7 @@ enum algos {
|
||||
ALGO_LYRA2Z330,
|
||||
ALGO_M7M,
|
||||
ALGO_MINOTAUR,
|
||||
ALGO_MINOTAURX,
|
||||
ALGO_MYR_GR,
|
||||
ALGO_NEOSCRYPT,
|
||||
ALGO_NIST5,
|
||||
@@ -633,7 +644,6 @@ static const char* const algo_names[] = {
|
||||
"bmw",
|
||||
"bmw512",
|
||||
"c11",
|
||||
"decred",
|
||||
"deep",
|
||||
"dmd-gr",
|
||||
"groestl",
|
||||
@@ -652,6 +662,7 @@ static const char* const algo_names[] = {
|
||||
"lyra2z330",
|
||||
"m7m",
|
||||
"minotaur",
|
||||
"minotaurx",
|
||||
"myr-gr",
|
||||
"neoscrypt",
|
||||
"nist5",
|
||||
@@ -793,7 +804,6 @@ Options:\n\
|
||||
bmw BMW 256\n\
|
||||
bmw512 BMW 512\n\
|
||||
c11 Chaincoin\n\
|
||||
decred Blake256r14dcr\n\
|
||||
deep Deepcoin (DCN)\n\
|
||||
dmd-gr Diamond\n\
|
||||
groestl Groestl coin\n\
|
||||
@@ -813,6 +823,7 @@ Options:\n\
|
||||
m7m Magi (XMG)\n\
|
||||
myr-gr Myriad-Groestl\n\
|
||||
minotaur\n\
|
||||
minotaurx\n\
|
||||
neoscrypt NeoScrypt(128, 2, 1)\n\
|
||||
nist5 Nist5\n\
|
||||
pentablake 5 x blake512\n\
|
||||
|
25
simd-utils.h
25
simd-utils.h
@@ -57,10 +57,15 @@
|
||||
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
|
||||
// alignment is recommended in all cases for best cache alignment.
|
||||
//
|
||||
// All functions are defined with type agnostic pointers (void*) arguments
|
||||
// and are cast or aliased as the appropriate type. This adds convenience
|
||||
// for the applications but also adds responsibility to ensure adequate data
|
||||
// alignment.
|
||||
//
|
||||
// Windows has problems with function vector arguments larger than
|
||||
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
|
||||
// pointers for larger vectors in function arguments. Macros can be
|
||||
// used for larger value arguments.
|
||||
// pointers for larger vectors in function arguments. Macros can be used
|
||||
// for larger value arguments.
|
||||
//
|
||||
// An attempt was made to make the names as similar as possible to
|
||||
// Intel's intrinsic function format. Most variations are to avoid
|
||||
@@ -74,7 +79,7 @@
|
||||
// to avoid the ambiguity of "mm".
|
||||
// - the element size does not include additional type specifiers
|
||||
// like "epi".
|
||||
// - some macros contain value args that are updated.
|
||||
// - some macros may contain value args that are updated.
|
||||
// - specialized shift and rotate functions that move elements around
|
||||
// use the notation "1x32" to indicate the distance moved as units of
|
||||
// the element size.
|
||||
@@ -86,10 +91,10 @@
|
||||
//
|
||||
// Function names follow this pattern:
|
||||
//
|
||||
// prefix_op[esize]_[vsize]
|
||||
// prefix_op[vsize]_[esize]
|
||||
//
|
||||
// Prefix: usually the size of the largest vectors used. Following
|
||||
// are some examples:
|
||||
// Prefix: usually the size of the returned vector.
|
||||
// Following are some examples:
|
||||
//
|
||||
// u64: unsigned 64 bit integer function
|
||||
// i128: signed 128 bit integer function (rarely used)
|
||||
@@ -102,10 +107,12 @@
|
||||
// esize: optional, element size of operation
|
||||
//
|
||||
// vsize: optional, lane size used when a function operates on elements
|
||||
// of vectors within lanes of a vector.
|
||||
// within lanes of a larger vector.
|
||||
//
|
||||
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
|
||||
// right by 64 bits.
|
||||
// m256_const_64 defines a vector constructed from the supplied 64 bit
|
||||
// integer arguments.
|
||||
// mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
|
||||
// right by 32 bits.
|
||||
//
|
||||
// Vector constants
|
||||
//
|
||||
|
@@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
|
||||
#else
|
||||
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
|
||||
#endif
|
||||
return a;
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
@@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
#else
|
||||
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
|
||||
#endif
|
||||
return a;
|
||||
return a;
|
||||
}
|
||||
|
||||
// Inconsistent naming, prefix should reflect return value:
|
||||
@@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a )
|
||||
#else
|
||||
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
|
||||
#endif
|
||||
return n;
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
@@ -90,7 +90,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
#else
|
||||
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
|
||||
#endif
|
||||
return n;
|
||||
return n;
|
||||
}
|
||||
|
||||
// Equivalent of set1, broadcast integer to all elements.
|
||||
@@ -193,13 +193,23 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
|
||||
// Basic operations without equivalent SIMD intrinsic
|
||||
|
||||
// Bitwise not (~v)
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
static inline __m128i mm128_not( const __m128i v )
|
||||
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
// Unary negation of elements (-v)
|
||||
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
|
||||
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
|
||||
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
|
||||
|
||||
*/
|
||||
|
||||
// Add 4 values, fewer dependencies than sequential addition.
|
||||
#define mm128_add4_64( a, b, c, d ) \
|
||||
@@ -255,20 +265,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm128_xor3( a, b, c ) \
|
||||
_mm_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm128_xorand( a, b, c ) \
|
||||
_mm_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_xor3( a, b, c ) \
|
||||
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
|
||||
#define mm128_xorand( a, b, c ) \
|
||||
_mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#endif
|
||||
|
||||
@@ -283,26 +289,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm_movmask_32( v ) \
|
||||
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
|
||||
|
||||
|
||||
// Diagonal blend
|
||||
|
||||
// Blend 4 32 bit elements from 4 vectors
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
|
||||
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
|
||||
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
|
||||
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
|
||||
_mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
|
||||
@@ -401,14 +387,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
//
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from src a, and the high half from src b.
|
||||
#define mm128_shuffle2_64( a, b, c ) \
|
||||
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
|
||||
_mm_castsi128_pd( b ), c ) );
|
||||
// half is always taken from v1, and the high half from v2.
|
||||
#define mm128_shuffle2_64( v1, v2, c ) \
|
||||
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
|
||||
_mm_castsi128_pd( v2 ), c ) );
|
||||
|
||||
#define mm128_shuffle2_32( a, b, c ) \
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
|
||||
_mm_castsi128_ps( b ), c ) );
|
||||
#define mm128_shuffle2_32( v1, v2, c ) \
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) );
|
||||
|
||||
//
|
||||
// Rotate vector elements across all lanes
|
||||
@@ -475,6 +461,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_bswap_128( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
|
||||
0x08090a0b0c0d0e0f ) )
|
||||
|
||||
#define mm128_bswap_64( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) )
|
||||
@@ -536,6 +526,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
|
||||
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
|
||||
}
|
||||
|
||||
#define mm128_bswap_128( v ) \
|
||||
mm128_swap_64( mm128_bswap_64( v ) )
|
||||
|
||||
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[0] = mm128_bswap_64( s[0] );
|
||||
@@ -562,9 +555,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
//
|
||||
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
|
||||
|
||||
// Swap 128 bit vectors.
|
||||
// This should be avoided, it's more efficient to switch references.
|
||||
#define mm128_swap256_128( v1, v2 ) \
|
||||
@@ -573,163 +563,24 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
v1 = _mm_xor_si128( v1, v2 );
|
||||
|
||||
|
||||
// Two input shuffle-rotate.
|
||||
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
|
||||
// alignr for 32 & 64 bit elements is only available with AVX512 but
|
||||
// emulated here. Shift argument is not needed, it's always 1.
|
||||
// Behaviour is otherwise consistent with Intel alignr intrinsics.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Function macros with two inputs and one output, inputs are preserved.
|
||||
// Returns the high 128 bits, ie updated v1.
|
||||
// These functions are preferred but only available with SSSE3. Use procedure
|
||||
// macros below for SSE2 compatibility.
|
||||
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
|
||||
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
|
||||
|
||||
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
#else
|
||||
|
||||
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
|
||||
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
|
||||
#define mm128_alignr_64( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
|
||||
|
||||
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
|
||||
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
|
||||
#define mm128_alignr_32( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
|
||||
|
||||
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
|
||||
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
|
||||
// Deprecated for SSSE3 and above, SSSE3 versions exist only for
|
||||
// compatibility with existing code.
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
||||
_mm_slli_si128( v2, 8 ) ); \
|
||||
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
|
||||
_mm_slli_si128( v1, 8 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||
_mm_srli_si128( v2, 8 ) ); \
|
||||
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
|
||||
_mm_srli_si128( v1, 8 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
||||
_mm_slli_si128( v2, 12 ) ); \
|
||||
v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
|
||||
_mm_slli_si128( v1, 12 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
||||
_mm_srli_si128( v2, 12 ) ); \
|
||||
v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
|
||||
_mm_srli_si128( v1, 12 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
|
||||
_mm_slli_si128( v2, 14 ) ); \
|
||||
v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
|
||||
_mm_slli_si128( v1, 14 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_16( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
|
||||
_mm_srli_si128( v2, 14 ) ); \
|
||||
v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
|
||||
_mm_srli_si128( v1, 14 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vror256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
|
||||
_mm_slli_si128( v2, 15 ) ); \
|
||||
v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
|
||||
_mm_slli_si128( v1, 15 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_8( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
|
||||
_mm_srli_si128( v2, 15 ) ); \
|
||||
v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
|
||||
_mm_srli_si128( v1, 15 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#endif // SSE4.1 else SSE2
|
||||
#endif
|
||||
|
||||
#endif // __SSE2__
|
||||
#endif // SIMD_128_H__
|
||||
|
@@ -1,30 +1,29 @@
|
||||
#if !defined(SIMD_256_H__)
|
||||
#define SIMD_256_H__ 1
|
||||
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX2 256 bit vectors
|
||||
//
|
||||
// Basic support for 256 bit vectors is available with AVX but integer
|
||||
// support requires AVX2.
|
||||
// Some 256 bit vector utilities require AVX512 or have more efficient
|
||||
// AVX512 implementations. They will be selected automatically but their use
|
||||
// is limited because 256 bit vectors are less likely to be used when 512
|
||||
// is available.
|
||||
//
|
||||
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
|
||||
// version is not. Some usage has the index vector encoded as if full vector
|
||||
// shuffles are supported. This has no side effects and would have the same
|
||||
// results using either version.
|
||||
// If needed and AVX512 is available, 256 bit full vector shuffles can be
|
||||
// implemented using the AVX512 zero-mask feature with a NULL mask.
|
||||
// Using intrinsics it's simple:
|
||||
// _mm256_maskz_shuffle_epi8( k0, v, c )
|
||||
// With asm it's a bit more complicated with the addition of the mask register
|
||||
// and zero tag:
|
||||
// vpshufb ymm0{k0}{z}, ymm1, ymm2
|
||||
// AVX512VL backports some AVX512 features to 256 bit vectors and can produce
|
||||
// more efficient implementations of some functions. They will be selected
|
||||
// automatically but their use is limited because 256 bit vectors are less
|
||||
// likely to be used when 512 is available.
|
||||
//
|
||||
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
|
||||
// lanes and data can't cross the 128 bit lane boundary.
|
||||
// Full width byte shuffle is available with AVX512VL using the mask version
|
||||
// with a full mask (-1).
|
||||
// Instructions that can move data across 128 bit lane boundary incur a
|
||||
// performance penalty over those that can't.
|
||||
// Some usage of index vectors may be encoded as if full vector shuffles are
|
||||
// supported. This has no side effects and would have the same results using
|
||||
// either version.
|
||||
// If the need arises and AVX512VL is available, 256 bit full vector byte
|
||||
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
@@ -57,8 +56,8 @@ typedef union
|
||||
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
|
||||
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Move integer to low element of vector, other elements are set to zero.
|
||||
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
|
||||
@@ -68,11 +67,6 @@ typedef union
|
||||
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
|
||||
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
|
||||
|
||||
// deprecated
|
||||
//#define mm256_mov256_64 u64_mov256_64
|
||||
//#define mm256_mov256_32 u32_mov256_32
|
||||
|
||||
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
|
||||
@@ -144,13 +138,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// Bitwise not ( ~v )
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
static inline __m256i mm256_not( const __m256i v )
|
||||
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
|
||||
|
||||
#else
|
||||
|
||||
// Bitwise not ( ~v ) without AVX512VL: XOR with the all-ones constant.
// The definition ends on this line; no line continuation is needed.
#define mm256_not( v )   _mm256_xor_si256( v, m256_neg1 )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
// Unary negation of each element ( -v )
|
||||
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
|
||||
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
|
||||
#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v )
|
||||
*/
|
||||
|
||||
|
||||
// Add 4 values, fewer dependencies than sequential addition.
|
||||
@@ -172,44 +176,34 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
// AVX512 has ternary logic that supports any 3 input boolean expression.
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm256_xor3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// legacy convenience only
|
||||
#define mm256_xor4( a, b, c, d ) \
|
||||
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
|
||||
#define mm256_xor4( a, b, c, d ) _mm256_xor_si256( a, mm256_xor3( b, c, d ) )
|
||||
|
||||
// a & b & c
|
||||
#define mm256_and3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
#define mm256_and3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
|
||||
// a | b | c
|
||||
#define mm256_or3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
#define mm256_or3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm256_xorand( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
#define mm256_xorand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define mm256_andxor( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
#define mm256_andxor( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define mm256_xoror( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
#define mm256_xoror( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c )
|
||||
#define mm256_xorandnot( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
// a | ( b & c )
|
||||
#define mm256_orand( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define mm256_xnor( a, b ) \
|
||||
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
#else
|
||||
|
||||
@@ -256,32 +250,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
#define mm256_movmask_32( v ) \
|
||||
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
|
||||
|
||||
|
||||
// Diagonal blending
|
||||
|
||||
// Blend 4 64 bit elements from 4 vectors
|
||||
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
|
||||
mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
|
||||
_mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
|
||||
|
||||
// Blend 8 32 bit elements from 8 vectors
|
||||
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
|
||||
_mm256_blend_epi32( \
|
||||
_mm256_blend_epi32( \
|
||||
_mm256_blend_epi32( v7, v6, 0x40 ), \
|
||||
_mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
|
||||
_mm256_blend_epi32( \
|
||||
_mm256_blend_epi32( v3, v2, 0x04) \
|
||||
_mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
|
||||
|
||||
|
||||
// Blend 4 32 bit elements from each 128 bit lane.
|
||||
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
|
||||
_mm256_blend_epi32( \
|
||||
_mm256_blend_epi32( v3, v2, 0x44) \
|
||||
_mm256_blend_epi32( v1, v0, 0x11 ) )
|
||||
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
//
|
||||
@@ -400,6 +368,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
|
||||
|
||||
// Rotate 256 bit vector by one 32 bit element.
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
static inline __m256i mm256_shuflr_32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 1 ); }
|
||||
|
||||
static inline __m256i mm256_shufll_32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 15 ); }
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_shuflr_32( v ) \
|
||||
_mm256_permutevar8x32_epi32( v, \
|
||||
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
|
||||
@@ -410,17 +388,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
|
||||
0x0000000200000001, 0x0000000000000007 ) )
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||
|
||||
// Limited 2 input shuffle
|
||||
#define mm256_shuffle2_64( a, b, c ) \
|
||||
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
|
||||
_mm256_castsi256_pd( b ), c ) );
|
||||
// Limited 2 input shuffle of 64 bit elements, control c is an immediate.
// Expands to a pure expression (no trailing semicolon) so it can be nested
// inside other expressions or passed as a function argument.
#define mm256_shuffle2_64( v1, v2, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
                                           _mm256_castsi256_pd( v2 ), c ) )
|
||||
|
||||
#define mm256_shuffle2_32( a, b, c ) \
|
||||
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
|
||||
_mm256_castsi256_ps( b ), c ) );
|
||||
// Limited 2 input shuffle of 32 bit elements, control c is an immediate.
// Expands to a pure expression (no trailing semicolon) so it can be nested
// inside other expressions or passed as a function argument.
#define mm256_shuffle2_32( v1, v2, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) )
|
||||
|
||||
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
|
||||
#define mm256_shuflr128_64 mm256_swap128_64
|
||||
@@ -444,8 +424,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
|
||||
#else
|
||||
#define mm256_shuflr64_24( v ) \
|
||||
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
|
||||
0x0a09080f0e0d0c0b, 0x0201000706050403, \
|
||||
_mm256_shuffle_epi8( v, m256_const2_64( \
|
||||
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
|
||||
#endif
|
||||
|
||||
@@ -453,8 +432,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
|
||||
#else
|
||||
#define mm256_shuflr64_16( v ) \
|
||||
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302, \
|
||||
_mm256_shuffle_epi8( v, m256_const2_64( \
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
|
||||
#endif
|
||||
|
||||
@@ -462,8 +440,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
|
||||
#else
|
||||
#define mm256_swap32_16( v ) \
|
||||
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302, \
|
||||
_mm256_shuffle_epi8( v, m256_const2_64( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
|
||||
#endif
|
||||
#define mm256_shuflr32_16 mm256_swap32_16
|
||||
@@ -478,35 +455,24 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
|
||||
#endif
|
||||
|
||||
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
|
||||
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
|
||||
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
|
||||
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
|
||||
// AVX2 version will work here. The bswap control vector is coded to work
|
||||
// with both versions, bit 4 is ignored in AVX2.
|
||||
|
||||
// Reverse byte order in elements, endian bswap.
|
||||
#define mm256_bswap_64( v ) \
|
||||
_mm256_shuffle_epi8( v, \
|
||||
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
|
||||
m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
|
||||
|
||||
#define mm256_bswap_32( v ) \
|
||||
_mm256_shuffle_epi8( v, \
|
||||
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
|
||||
m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
|
||||
|
||||
// Reverse byte order of each 16 bit element (endian bswap).
// The control constant takes exactly two 64 bit arguments.
#define mm256_bswap_16( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
|
||||
|
||||
// Source and destination are pointers, may point to same memory.
|
||||
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
|
||||
#define mm256_block_bswap_64( d, s ) do \
|
||||
{ \
|
||||
__m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
|
||||
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
|
||||
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
|
||||
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
|
||||
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
|
||||
@@ -520,8 +486,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
|
||||
#define mm256_block_bswap_32( d, s ) do \
|
||||
{ \
|
||||
__m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
|
||||
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
|
||||
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
|
||||
|
@@ -2,42 +2,57 @@
|
||||
#define SIMD_512_H__ 1
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX-512
|
||||
// AVX512 512 bit vectors
|
||||
//
|
||||
// The baseline for these utilities is AVX512F, AVX512DQ, AVX512BW
|
||||
// and AVX512VL, first available in quantity in Skylake-X.
|
||||
// Some utilities may require additional features available in subsequent
|
||||
// architectures and are noted.
|
||||
|
||||
// Some utilities may require additional AVX512 extensions available in
|
||||
// subsequent architectures and are noted where used.
|
||||
// AVX512VL is used to backport AVX512 instructions to 128 and 256 bit
|
||||
// vectors. It is therefore not technically required for any 512 bit vector
|
||||
// utilities defined below.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// AVX512 intrinsics have a few changes from previous conventions.
|
||||
//
|
||||
// cmp instruction now returns a bitmask instead of a vector mask.
|
||||
// This eliminates the need for the blendv instruction.
|
||||
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
|
||||
// This removes the need for an explicit movemask instruction.
|
||||
//
|
||||
// The new rotate instructions require the count to be an 8 bit
|
||||
// immediate value only. Compilation fails if a variable is used.
|
||||
// The documentation is the same as for shift and it works with
|
||||
// variables. The inconsistency is likely due to compiler optimizations
|
||||
// that can eliminate the variable in some instances.
|
||||
// Many previously sizeless (si) instructions now have sized (epi) versions
|
||||
// to accommodate masking packed elements.
|
||||
//
|
||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||
// usually shuffles across all lanes.
|
||||
// Many AVX512 instructions have a different argument order from the AVX2
|
||||
// versions of similar instructions. There is also some inconsistency in how
|
||||
// different AVX512 instructions position the mask register in the argument
|
||||
// list.
|
||||
//
|
||||
// permutexvar has args reversed, index is first arg. Previously all
|
||||
// permutes and shuffles have the index last.
|
||||
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
|
||||
// AVX512 permutes can cross all lanes.
|
||||
//
|
||||
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
|
||||
// It also performs the same op as _mm512_shuffle_epi8.
|
||||
// "_mm512_shuffle_epi8" shuffles across the entire 512 bits. Shuffle
|
||||
// instructions generally don't cross 128 bit lane boundaries and the AVX2
|
||||
// version of this specific instruction does not.
|
||||
//
|
||||
// shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||
// where shuffle_epi8 spans the entire vector.
|
||||
// New alignr instructions for epi64 and epi32 operate across the entire
|
||||
// vector but are slower than epi8, which continues to be restricted to 128 bit
|
||||
// lanes.
|
||||
//
|
||||
// There are 2 areas where overhead is a concern: constants and
|
||||
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
|
||||
// AVX512-VBMI. The same instructions with larger elements don't have this
|
||||
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
|
||||
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
|
||||
//
|
||||
// Two coding conventions are used to prevent macro argument side effects:
|
||||
// - if a macro arg is used in an expression it must be protected by
|
||||
// parentheses to ensure an expression argument is evaluated first.
|
||||
// - if an argument is to be referenced multiple times a C inline function
|
||||
// should be used instead of a macro to prevent an expression argument
|
||||
// from being evaluated multiple times.
|
||||
//
|
||||
// There are 2 areas where overhead is a major concern: constants and
|
||||
// permutations.
|
||||
//
|
||||
// Constants need to be composed at run time by assembling individual
|
||||
@@ -60,13 +75,10 @@
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smaller vector can use an imm control.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instruction
|
||||
// can be used with imm control instead of permute.
|
||||
|
||||
// Permutations that cross 128 bit lanes are typically slower and often need
|
||||
// a vector control index. If the permutation doesn't need to cross 128 bit
|
||||
// lanes a shuffle instruction can often be used with an imm control.
|
||||
//
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX512 512 bit vectors
|
||||
@@ -173,22 +185,30 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
#define m512_one_16 m512_const1_16( 1 )
|
||||
#define m512_one_8 m512_const1_8( 1 )
|
||||
|
||||
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
|
||||
#define m512_neg1 _mm512_movm_epi64( 0xff )
|
||||
// use asm to avoid compiler warning for uninitialized local
|
||||
static inline __m512i mm512_neg1_fn()
|
||||
{
|
||||
__m512i a;
|
||||
asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) );
|
||||
return a;
|
||||
}
|
||||
#define m512_neg1 mm512_neg1_fn() // 1 clock
|
||||
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
|
||||
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks
|
||||
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// ~x
|
||||
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
|
||||
// Bitwise NOT: ~x
|
||||
static inline __m512i mm512_not( const __m512i x )
|
||||
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
|
||||
|
||||
// -x
|
||||
/*
|
||||
// Unary negation: -x
|
||||
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
|
||||
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
|
||||
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// Pointer casting
|
||||
@@ -242,76 +262,43 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
// expression using any number or combinations of AND, OR, XOR, NOT.
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm512_xor3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
#define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// legacy convenience only
|
||||
#define mm512_xor4( a, b, c, d ) \
|
||||
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
|
||||
#define mm512_xor4( a, b, c, d ) _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
|
||||
|
||||
// a & b & c
|
||||
#define mm512_and3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
#define mm512_and3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
|
||||
// a | b | c
|
||||
#define mm512_or3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
#define mm512_or3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm512_xorand( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define mm512_andxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
#define mm512_andxor( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define mm512_xoror( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
#define mm512_xoror( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
|
||||
#define mm512_xorandnot( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
// a ^ ( ~b & c ), xor( a, andnot( b, c ) )
|
||||
#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
// a | ( b & c )
|
||||
#define mm512_orand( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
#define mm512_orand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// Some 2 input operations that don't have their own instruction mnemonic.
|
||||
// Use with caution, args are not expression safe.
|
||||
|
||||
// ~( a | b ), (~a) & (~b)
|
||||
#define mm512_nor( a, b ) \
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
|
||||
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
|
||||
|
||||
// ~( a ^ b ), (~a) ^ b
|
||||
#define mm512_xnor( a, b ) \
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
// ~( a & b )
|
||||
#define mm512_nand( a, b ) \
|
||||
_mm512_ternarylogic_epi64( a, b, b, 0xef )
|
||||
|
||||
|
||||
// Diagonal blending
|
||||
// Blend 8 64 bit elements from 8 vectors
|
||||
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
|
||||
_mm512_mask_blend_epi64( 0x0f, \
|
||||
_mm512_mask_blend_epi64( 0x30, \
|
||||
_mm512_mask_blend_epi64( 0x40, v7, v6 ), \
|
||||
_mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
|
||||
_mm512_mask_blend_epi64( 0x03, \
|
||||
_mm512_mask_blend_epi64( 0x04, v3, v2 ) \
|
||||
_mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
|
||||
|
||||
|
||||
// Blend 4 32 bit elements from each 128 bit lane.
|
||||
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
|
||||
_mm512_mask_blend_epi32( 0x3333, \
|
||||
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
|
||||
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
|
||||
|
||||
|
||||
|
||||
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
|
||||
|
||||
// Bit rotations.
|
||||
|
||||
@@ -328,14 +315,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_ror_32 _mm512_ror_epi32
|
||||
#define mm512_rol_32 _mm512_rol_epi32
|
||||
|
||||
// Rotations using a vector control index are very slow due to overhead
|
||||
// to generate the index vector. Repeated rotations using the same index
|
||||
// are better handled by the calling function where the index only needs
|
||||
// to be generated once then reused very efficiently.
|
||||
// Permutes and shuffles using an immediate index are significantly faster.
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements, vectorized endian conversion.
|
||||
// Reverse byte order of packed elements, vectorized endian conversion.
|
||||
|
||||
#define mm512_bswap_64( v ) \
|
||||
_mm512_shuffle_epi8( v, \
|
||||
@@ -394,30 +375,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
} while(0)
|
||||
|
||||
|
||||
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
|
||||
//
|
||||
|
||||
#define mm512_shiftr_256( v ) \
|
||||
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
|
||||
#define mm512_shiftl_256( v ) mm512_shifr_256
|
||||
|
||||
#define mm512_shiftr_128( v ) \
|
||||
_mm512_alignr_epi64( _mm512_setzero, v, 2 )
|
||||
#define mm512_shiftl_128( v ) \
|
||||
_mm512_alignr_epi64( v, _mm512_setzero, 6 )
|
||||
|
||||
#define mm512_shiftr_64( v ) \
|
||||
_mm512_alignr_epi64( _mm512_setzero, v, 1 )
|
||||
#define mm512_shiftl_64( v ) \
|
||||
_mm512_alignr_epi64( v, _mm512_setzero, 7 )
|
||||
|
||||
#define mm512_shiftr_32( v ) \
|
||||
_mm512_alignr_epi32( _mm512_setzero, v, 1 )
|
||||
#define mm512_shiftl_32( v ) \
|
||||
_mm512_alignr_epi32( v, _mm512_setzero, 15 )
|
||||
|
||||
// Shuffle-rotate elements left or right in 512 bit vector.
|
||||
// Cross-lane shuffles implementing rotation of packed elements.
|
||||
//
|
||||
|
||||
// Rotate elements across entire vector.
|
||||
static inline __m512i mm512_swap_256( const __m512i v )
|
||||
{ return _mm512_alignr_epi64( v, v, 4 ); }
|
||||
// Rotating the vector by 256 bits is the same as swapping its two halves;
// forward the argument so the alias expands to an actual call.
#define mm512_shuflr_256( v )    mm512_swap_256( v )
|
||||
@@ -451,16 +412,16 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuflr_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x0000001F001E001D, 0x001C001B001A0019, \
|
||||
0X0018001700160015, 0X0014001300120011, \
|
||||
0X0010000F000E000D, 0X000C000B000A0009, \
|
||||
0X0008000700060005, 0X0004000300020001 ), v )
|
||||
0x0018001700160015, 0x0014001300120011, \
|
||||
0x0010000F000E000D, 0x000C000B000A0009, \
|
||||
0x0008000700060005, 0x0004000300020001 ), v )
|
||||
|
||||
#define mm512_shufll_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001E001D001C001B, 0x001A001900180017, \
|
||||
0X0016001500140013, 0X001200110010000F, \
|
||||
0X000E000D000C000B, 0X000A000900080007, \
|
||||
0X0006000500040003, 0X000200010000001F ), v )
|
||||
0x0016001500140013, 0x001200110010000F, \
|
||||
0x000E000D000C000B, 0x000A000900080007, \
|
||||
0x0006000500040003, 0x000200010000001F ), v )
|
||||
|
||||
#define mm512_shuflr_8( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
@@ -476,9 +437,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
0x1E1D1C1B1A191817, 0x161514131211100F, \
|
||||
0x0E0D0C0B0A090807, 0x060504030201003F ) )
|
||||
|
||||
//
|
||||
// 256 bit lanes used only by lyra2, move these there
|
||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||
// 128 bit lane shift is handled by bslli bsrli.
|
||||
|
||||
// Swap hi & lo 128 bits in each 256 bit lane
|
||||
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||
@@ -489,6 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
/*
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
#define mm512_shuflr256_32( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
@@ -531,20 +492,20 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
0x2e2d2c2b2a292827, 0x262524232221203f, \
|
||||
0x1e1d1c1b1a191817, 0x161514131211100f, \
|
||||
0x0e0d0c0b0a090807, 0x060504030201001f ) )
|
||||
|
||||
*/
|
||||
//
|
||||
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
|
||||
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
|
||||
// destination elements must come from a specific source.
|
||||
#define mm512_shuffle2_64( a, b, c ) \
|
||||
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
|
||||
_mm512_castsi512_pd( b ), c ) );
|
||||
// destination elements must come from a specific source arg.
|
||||
// Limited 2 input shuffle of 64 bit elements, control c is an immediate.
// Expands to a pure expression (no trailing semicolon) so it can be nested
// inside other expressions or passed as a function argument.
#define mm512_shuffle2_64( v1, v2, c ) \
   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( v1 ), \
                                           _mm512_castsi512_pd( v2 ), c ) )
|
||||
|
||||
#define mm512_shuffle2_32( a, b, c ) \
|
||||
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
|
||||
_mm512_castsi512_ps( b ), c ) );
|
||||
// Limited 2 input shuffle of 32 bit elements, control c is an immediate.
// Expands to a pure expression (no trailing semicolon) so it can be nested
// inside other expressions or passed as a function argument.
#define mm512_shuffle2_32( v1, v2, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                           _mm512_castsi512_ps( v2 ), c ) )
|
||||
|
||||
// Swap 64 bits in each 128 bit lane
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
@@ -583,21 +544,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
|
||||
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
|
||||
|
||||
|
||||
// 2 input, 1 output
|
||||
// Concatenate { v1, v2 } then rotate right or left and return the high
|
||||
// 512 bits, ie rotated v1.
|
||||
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
|
||||
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
|
||||
|
||||
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
|
||||
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
|
||||
|
||||
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
|
||||
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
|
||||
|
||||
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
|
||||
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
|
||||
|
||||
#endif // AVX512
|
||||
#endif // SIMD_512_H__
|
||||
|
@@ -34,10 +34,12 @@
|
||||
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
|
||||
// Bitwise not of a 64 bit vector, computed as a scalar 64 bit complement
// and cast back to the vector type.
#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) )
|
||||
|
||||
/*
|
||||
// Unary negate elements
|
||||
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
|
||||
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
|
||||
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
|
||||
*/
|
||||
|
||||
// Rotate bits in packed elements of 64 bit vector
|
||||
#define mm64_rol_64( a, n ) \
|
||||
|
@@ -333,7 +333,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
// CPU_INFO ECX
|
||||
#define SSE3_Flag 1
|
||||
#define SSSE3_Flag (1<< 9)
|
||||
#define XOP_Flag (1<<11)
|
||||
#define XOP_Flag (1<<11) // obsolete, only available on pre-Ryzen AMD
|
||||
#define FMA3_Flag (1<<12)
|
||||
#define AES_Flag (1<<25)
|
||||
#define SSE41_Flag (1<<19)
|
||||
|
291
util.c
291
util.c
@@ -44,28 +44,22 @@
|
||||
#include <libgen.h>
|
||||
#endif
|
||||
|
||||
//#include "miner.h"
|
||||
#include "elist.h"
|
||||
#include "algo-gate-api.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
//extern pthread_mutex_t stats_lock;
|
||||
|
||||
struct data_buffer {
|
||||
void *buf;
|
||||
size_t len;
|
||||
};
|
||||
|
||||
struct upload_buffer {
|
||||
const void *buf;
|
||||
size_t len;
|
||||
size_t pos;
|
||||
};
|
||||
|
||||
struct header_info {
|
||||
char *lp_path;
|
||||
char *reason;
|
||||
char *stratum_url;
|
||||
size_t content_length;
|
||||
};
|
||||
|
||||
struct data_buffer {
|
||||
void *buf;
|
||||
size_t len;
|
||||
size_t allocated;
|
||||
struct header_info *headers;
|
||||
};
|
||||
|
||||
struct tq_ent {
|
||||
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
|
||||
int len;
|
||||
// struct tm tm;
|
||||
// time_t now = time(NULL);
|
||||
|
||||
// localtime_r(&now, &tm);
|
||||
|
||||
switch ( prio )
|
||||
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
|
||||
static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
|
||||
void *user_data)
|
||||
{
|
||||
struct data_buffer *db = (struct data_buffer *) user_data;
|
||||
struct data_buffer *db = user_data;
|
||||
size_t len = size * nmemb;
|
||||
size_t oldlen, newlen;
|
||||
size_t newalloc, reqalloc;
|
||||
void *newmem;
|
||||
static const unsigned char zero = 0;
|
||||
static const size_t max_realloc_increase = 8 * 1024 * 1024;
|
||||
static const size_t initial_alloc = 16 * 1024;
|
||||
|
||||
oldlen = db->len;
|
||||
newlen = oldlen + len;
|
||||
/* minimum required allocation size */
|
||||
reqalloc = db->len + len + 1;
|
||||
|
||||
newmem = realloc(db->buf, newlen + 1);
|
||||
if (!newmem)
|
||||
return 0;
|
||||
if (reqalloc > db->allocated) {
|
||||
if (db->len > 0) {
|
||||
newalloc = db->allocated * 2;
|
||||
} else {
|
||||
if (db->headers->content_length > 0)
|
||||
newalloc = db->headers->content_length + 1;
|
||||
else
|
||||
newalloc = initial_alloc;
|
||||
}
|
||||
|
||||
db->buf = newmem;
|
||||
db->len = newlen;
|
||||
memcpy((uchar*) db->buf + oldlen, ptr, len);
|
||||
memcpy((uchar*) db->buf + newlen, &zero, 1); /* null terminate */
|
||||
if (db->headers->content_length == 0) {
|
||||
/* limit the maximum buffer increase */
|
||||
if (newalloc - db->allocated > max_realloc_increase)
|
||||
newalloc = db->allocated + max_realloc_increase;
|
||||
}
|
||||
|
||||
/* ensure we have a big enough allocation */
|
||||
if (reqalloc > newalloc)
|
||||
newalloc = reqalloc;
|
||||
|
||||
newmem = realloc(db->buf, newalloc);
|
||||
if (!newmem)
|
||||
return 0;
|
||||
|
||||
db->buf = newmem;
|
||||
db->allocated = newalloc;
|
||||
}
|
||||
|
||||
memcpy(db->buf + db->len, ptr, len); /* append new data */
|
||||
memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
|
||||
|
||||
db->len += len;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
|
||||
void *user_data)
|
||||
{
|
||||
struct upload_buffer *ub = (struct upload_buffer *) user_data;
|
||||
size_t len = size * nmemb;
|
||||
|
||||
if (len > ub->len - ub->pos)
|
||||
len = ub->len - ub->pos;
|
||||
|
||||
if (len) {
|
||||
memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
|
||||
ub->pos += len;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
#if LIBCURL_VERSION_NUM >= 0x071200
|
||||
static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
|
||||
{
|
||||
struct upload_buffer *ub = (struct upload_buffer *) user_data;
|
||||
|
||||
switch (origin) {
|
||||
case SEEK_SET:
|
||||
ub->pos = (size_t) offset;
|
||||
break;
|
||||
case SEEK_CUR:
|
||||
ub->pos += (size_t) offset;
|
||||
break;
|
||||
case SEEK_END:
|
||||
ub->pos = ub->len + (size_t) offset;
|
||||
break;
|
||||
default:
|
||||
return 1; /* CURL_SEEKFUNC_FAIL */
|
||||
}
|
||||
|
||||
return 0; /* CURL_SEEKFUNC_OK */
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
|
||||
{
|
||||
struct header_info *hi = (struct header_info *) user_data;
|
||||
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
|
||||
val = NULL;
|
||||
}
|
||||
|
||||
if (!strcasecmp("Content-Length", key))
|
||||
hi->content_length = strtoul(val, NULL, 10);
|
||||
|
||||
out:
|
||||
free(key);
|
||||
free(val);
|
||||
@@ -564,48 +546,37 @@ json_t *json_rpc_call(CURL *curl, const char *url,
|
||||
int rc;
|
||||
long http_rc;
|
||||
struct data_buffer all_data = {0};
|
||||
struct upload_buffer upload_data;
|
||||
char *json_buf;
|
||||
json_error_t err;
|
||||
struct curl_slist *headers = NULL;
|
||||
char len_hdr[64];
|
||||
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
|
||||
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
|
||||
struct header_info hi = {0};
|
||||
|
||||
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
|
||||
|
||||
if (opt_protocol)
|
||||
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
|
||||
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||
if (opt_cert)
|
||||
curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
|
||||
//
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
|
||||
|
||||
if (opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
|
||||
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
|
||||
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
|
||||
curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
|
||||
#if LIBCURL_VERSION_NUM >= 0x071200
|
||||
curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
|
||||
#endif
|
||||
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
|
||||
if (opt_redirect)
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
|
||||
if (opt_redirect) curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
|
||||
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
|
||||
if (opt_proxy) {
|
||||
if (opt_proxy)
|
||||
{
|
||||
curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
|
||||
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
|
||||
}
|
||||
if (userpass) {
|
||||
if (userpass)
|
||||
{
|
||||
curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
|
||||
}
|
||||
@@ -613,23 +584,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
|
||||
if (flags & JSON_RPC_LONGPOLL)
|
||||
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
|
||||
#endif
|
||||
curl_easy_setopt(curl, CURLOPT_POST, 1);
|
||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);
|
||||
|
||||
if (opt_protocol)
|
||||
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
|
||||
|
||||
upload_data.buf = rpc_req;
|
||||
upload_data.len = strlen(rpc_req);
|
||||
upload_data.pos = 0;
|
||||
sprintf(len_hdr, "Content-Length: %lu",
|
||||
(unsigned long) upload_data.len);
|
||||
|
||||
headers = curl_slist_append(headers, "Content-Type: application/json");
|
||||
headers = curl_slist_append(headers, len_hdr);
|
||||
headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
|
||||
headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
|
||||
//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
|
||||
//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
|
||||
//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
|
||||
//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||
|
||||
@@ -786,18 +750,26 @@ err_out:
|
||||
return cfg;
|
||||
}
|
||||
|
||||
// Segwit BEGIN
|
||||
void memrev(unsigned char *p, size_t len)
|
||||
{
|
||||
unsigned char c, *q;
|
||||
for (q = p + len - 1; p < q; p++, q--) {
|
||||
c = *p;
|
||||
*p = *q;
|
||||
*q = c;
|
||||
if ( len == 32 )
|
||||
{
|
||||
__m128i *pv = (__m128i*)p;
|
||||
__m128i t = mm128_bswap_128( pv[0] );
|
||||
pv[0] = mm128_bswap_128( pv[1] );
|
||||
pv[1] = t;
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned char c, *q;
|
||||
for (q = p + len - 1; p < q; p++, q--)
|
||||
{
|
||||
c = *p;
|
||||
*p = *q;
|
||||
*q = c;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Segwit END
|
||||
|
||||
|
||||
void cbin2hex(char *out, const char *in, size_t len)
|
||||
{
|
||||
@@ -832,32 +804,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
|
||||
return s;
|
||||
}
|
||||
|
||||
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
|
||||
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
|
||||
{
|
||||
char hex_byte[3];
|
||||
char *ep;
|
||||
if( hexstr == NULL ) return false;
|
||||
|
||||
hex_byte[2] = '\0';
|
||||
|
||||
while (*hexstr && len) {
|
||||
if (!hexstr[1]) {
|
||||
applog(LOG_ERR, "hex2bin str truncated");
|
||||
return false;
|
||||
}
|
||||
hex_byte[0] = hexstr[0];
|
||||
hex_byte[1] = hexstr[1];
|
||||
*p = (unsigned char) strtol(hex_byte, &ep, 16);
|
||||
if (*ep) {
|
||||
applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
|
||||
return false;
|
||||
}
|
||||
p++;
|
||||
hexstr += 2;
|
||||
len--;
|
||||
size_t hexstr_len = strlen( hexstr );
|
||||
if( ( hexstr_len % 2 ) != 0 )
|
||||
{
|
||||
applog( LOG_ERR, "hex2bin string truncated" );
|
||||
return false;
|
||||
}
|
||||
size_t bin_len = hexstr_len / 2;
|
||||
if ( bin_len > len )
|
||||
{
|
||||
applog( LOG_ERR, "hex2bin buffer too small" );
|
||||
return false;
|
||||
}
|
||||
|
||||
return(!len) ? true : false;
|
||||
/* return (len == 0 && *hexstr == 0) ? true : false; */
|
||||
memset( p, 0, len );
|
||||
size_t i = 0;
|
||||
while ( i < hexstr_len )
|
||||
{
|
||||
char c = hexstr[i];
|
||||
unsigned char nibble;
|
||||
if ( c >= '0' && c <= '9' ) nibble = (c - '0');
|
||||
else if ( c >= 'A' && c <= 'F' ) nibble = ( 10 + (c - 'A') );
|
||||
else if ( c >= 'a' && c <= 'f' ) nibble = ( 10 + (c - 'a') );
|
||||
else
|
||||
{
|
||||
applog( LOG_ERR, "hex2bin invalid hex" );
|
||||
return false;
|
||||
}
|
||||
p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
|
||||
i++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int varint_encode(unsigned char *p, uint64_t n)
|
||||
@@ -1339,6 +1321,43 @@ inline bool valid_hash( const void *hash, const void *target )
|
||||
|
||||
#endif
|
||||
|
||||
inline double nbits_to_diff( uint32_t nbits )
|
||||
{
|
||||
long double diff;
|
||||
uint32_t shift = nbits & 0xff;
|
||||
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
|
||||
int shift_off = (int)shift - 29;
|
||||
|
||||
// diff = ( (2**16 -1) / ( 256**shift_off * bits )
|
||||
// With uint128 byte shift is good for 16 <= shift <= 41. As unlikely
|
||||
// as this may seem necessary, check just in case.
|
||||
|
||||
if ( shift_off >= -13 && shift_off <= 12 )
|
||||
{ // fast
|
||||
if ( shift_off == 0 )
|
||||
diff = (long double)0xffff / (long double)bits;
|
||||
else if ( shift_off < 0 ) // shift < 29
|
||||
diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) )
|
||||
/ (long double)bits;
|
||||
else // ( shift_off > 0 ) // shift > 29
|
||||
diff = (long double)0xffff
|
||||
/ (long double)( (uint128_t)bits << ( shift_off*8 ) );
|
||||
}
|
||||
else
|
||||
{ // slow
|
||||
int m;
|
||||
diff = 0.;
|
||||
for ( m = shift; m < 29; m++ ) diff *= 256.0;
|
||||
for ( m = 29; m < shift; m++ ) diff /= 256.0;
|
||||
}
|
||||
|
||||
if ( opt_debug )
|
||||
applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
|
||||
nbits, shift, shift_off, bits, (double)diff );
|
||||
|
||||
return (double)diff;
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
|
||||
#else
|
||||
@@ -1371,7 +1390,7 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
|
||||
{
|
||||
if ( rc != CURLE_AGAIN )
|
||||
#else
|
||||
n = send(sock, s + sent, len, 0);
|
||||
n = send( sctx->sock, s + sent, len, 0);
|
||||
if ( n < 0 )
|
||||
{
|
||||
if ( !socket_blocks() )
|
||||
@@ -1379,8 +1398,8 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
|
||||
return false;
|
||||
n = 0;
|
||||
}
|
||||
sent += n;
|
||||
len -= n;
|
||||
sent += n;
|
||||
len -= n;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1507,7 +1526,8 @@ out:
|
||||
return sret;
|
||||
}
|
||||
|
||||
#if LIBCURL_VERSION_NUM >= 0x071101
|
||||
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
|
||||
//#if LIBCURL_VERSION_NUM >= 0x071101
|
||||
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
|
||||
struct curl_sockaddr *addr)
|
||||
{
|
||||
@@ -1575,7 +1595,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
|
||||
#if LIBCURL_VERSION_NUM >= 0x070f06
|
||||
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
|
||||
#endif
|
||||
#if LIBCURL_VERSION_NUM >= 0x071101
|
||||
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
|
||||
//#if LIBCURL_VERSION_NUM >= 0x071101
|
||||
curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
|
||||
curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
|
||||
#endif
|
||||
@@ -1589,7 +1610,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
|
||||
return false;
|
||||
}
|
||||
|
||||
#if LIBCURL_VERSION_NUM < 0x071101
|
||||
#if LIBCURL_VERSION_NUM >= 0x072d00
|
||||
curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
|
||||
#elif LIBCURL_VERSION_NUM < 0x071101
|
||||
//#if LIBCURL_VERSION_NUM < 0x071101
|
||||
/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
|
||||
curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
|
||||
#endif
|
||||
@@ -1885,7 +1909,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
|
||||
|
||||
// find 0xffff tag
|
||||
p = (uint8_t*) sctx->job.coinbase + 32;
|
||||
m = p + 128;
|
||||
m = p + sctx->job.coinbase_size - 32 - 2;
|
||||
// m = p + 128;
|
||||
while (*p != 0xff && p < m) p++;
|
||||
while (*p == 0xff && p < m) p++;
|
||||
if (*(p-1) == 0xff && *(p-2) == 0xff) {
|
||||
|
@@ -17,7 +17,9 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
|
||||
# used by GCC
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
# Support for Windows 7 CPU groups, AES sometimes not included in -march
|
||||
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
|
||||
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
|
||||
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
|
||||
export DEFAULT_CFLAGS="-maes -O3 -Wall"
|
||||
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
|
||||
|
||||
# make link to local gmp header file.
|
||||
|
Reference in New Issue
Block a user