mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
41 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
57a6b7b58b | ||
![]() |
de564ccbde | ||
![]() |
fcd7727b0d | ||
![]() |
3dd6787531 | ||
![]() |
cae1ce2ab7 | ||
![]() |
7a91c41d74 | ||
![]() |
c6bc9d67fb | ||
![]() |
b339450898 | ||
![]() |
fb93160641 | ||
![]() |
520d4d5384 | ||
![]() |
da7030faa8 | ||
![]() |
bd84f199fe | ||
![]() |
58030e2788 | ||
![]() |
1321ac474c | ||
![]() |
40d07c0097 | ||
![]() |
f552f2b1e8 | ||
![]() |
26b8927632 | ||
![]() |
db76d3865f | ||
![]() |
5b678d2481 | ||
![]() |
90137b391e | ||
![]() |
8727d79182 | ||
![]() |
17ccbc328f | ||
![]() |
0e3945ddb5 | ||
![]() |
7d2ef7973d | ||
![]() |
e6fd9b1d69 | ||
![]() |
1a234cbe53 | ||
![]() |
47cc5dcff5 | ||
![]() |
2cd1507c2e | ||
![]() |
9b905fccc8 | ||
![]() |
92b3733925 | ||
![]() |
19cc88d102 | ||
![]() |
a053690170 | ||
![]() |
3c5e8921b7 | ||
![]() |
f3333b0070 | ||
![]() |
902ec046dd | ||
![]() |
d0b4941321 | ||
![]() |
40089428c5 | ||
![]() |
dc6b007a18 | ||
![]() |
06bfaa1249 | ||
![]() |
6566e99a13 | ||
![]() |
ccfccbadd5 |
@@ -1,4 +1,6 @@
|
||||
|
||||
These instructions may be out of date, see the Wiki for the latest...
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
1. Requirements:
|
||||
---------------
|
||||
@@ -32,14 +34,26 @@ but different package names.
|
||||
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||
support depending on your CPU and compiler version:
|
||||
openssl 1.1.0e or higher.
|
||||
|
||||
"-march=native" is always the best choice
|
||||
znver1 and znver2 should be recognized on most recent version of GCC and
|
||||
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
|
||||
In the meantime here are some suggestions to compile with new CPUs:
|
||||
|
||||
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||
"-march=native" is usually the best choice, used by build.sh.
|
||||
|
||||
"-msha" Add SHA to other tuning options
|
||||
"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized.
|
||||
|
||||
"-mcascadelake -msha" or
|
||||
"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
|
||||
|
||||
Features can also be added individually:
|
||||
|
||||
"-msha" adds support for HW accelerated sha256.
|
||||
|
||||
"-mavx512" adds support for 512 bit vectors
|
||||
|
||||
"-mvaes" add support for parallel AES
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
|
172
INSTALL_WINDOWS
172
INSTALL_WINDOWS
@@ -1,172 +1,4 @@
|
||||
Instructions for compiling cpuminer-opt for Windows.
|
||||
Please consult the wiki for Windows compile instructions.
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Windows compilation using Visual Studio is not supported. Mingw64 is
|
||||
used on a Linux system (bare metal or virtual machine) to cross-compile
|
||||
cpuminer-opt executable binaries for Windows.
|
||||
|
||||
These instructions were written for Debian and Ubuntu compatible distributions
|
||||
but should work on other major distributions as well. However some of the
|
||||
package names or file paths may be different.
|
||||
|
||||
It is assumed a Linux system is already available and running. And the user
|
||||
has enough Linux knowledge to find and install packages and follow these
|
||||
instructions.
|
||||
|
||||
First it is a good idea to create new user specifically for cross compiling.
|
||||
It keeps all mingw stuff contained and isolated from the rest of the system.
|
||||
|
||||
Step by step...
|
||||
|
||||
1. Install necessary packages from the distribution's repositories.
|
||||
|
||||
Refer to Linux compile instructions and install required packages.
|
||||
|
||||
Additionally, install mingw-w64.
|
||||
|
||||
sudo apt-get install mingw-w64
|
||||
|
||||
|
||||
2. Create a local library directory for packages to be compiled in the next
|
||||
step. Suggested location is $HOME/usr/lib/
|
||||
|
||||
3. Download and build other packages for mingw that don't have a mingw64
|
||||
version available in the repositories.
|
||||
|
||||
Download the following source code packages from their respective and
|
||||
respected download locations, copy them to ~/usr/lib/ and uncompress them.
|
||||
|
||||
openssl
|
||||
curl
|
||||
gmp
|
||||
|
||||
In most cases the latest vesrion is ok but it's safest to download
|
||||
the same major and minor version as included in your distribution.
|
||||
|
||||
Run the following commands or follow the supplied instructions.
|
||||
Do not run "make install" unless you are using ~/usr/lib, which isn't
|
||||
recommended.
|
||||
|
||||
Some instructions insist on running "make check". If make check fails
|
||||
it may still work, YMMV.
|
||||
|
||||
You can speed up "make" by using all CPU cores available with "-j n" where
|
||||
n is the number of CPU threads you want to use.
|
||||
|
||||
openssl:
|
||||
|
||||
./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
curl:
|
||||
|
||||
./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
gmp:
|
||||
|
||||
./configure --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
|
||||
|
||||
4. Tweak the environment.
|
||||
|
||||
This step is required everytime you login or the commands can be added to
|
||||
.bashrc.
|
||||
|
||||
Define some local variables to point to local library.
|
||||
|
||||
export LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
Create a release directory and copy some dll files previously built.
|
||||
This can be done outside of cpuminer-opt and only needs to be done once.
|
||||
If the release directory is in cpuminer-opt directory it needs to be
|
||||
recreated every a source package is decompressed.
|
||||
|
||||
mkdir release
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
|
||||
|
||||
The following steps need to be done every time a new source package is
|
||||
opened.
|
||||
|
||||
5. Download cpuminer-opt
|
||||
|
||||
Download the latest source code package of cpumuner-opt to your desired
|
||||
location. .zip or .tar.gz, your choice.
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/releases
|
||||
|
||||
Decompress and change to the cpuminer-opt directory.
|
||||
|
||||
|
||||
|
||||
6. Prepare to compile
|
||||
|
||||
Create a link to the locally compiled version of gmp.h
|
||||
|
||||
ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
|
||||
|
||||
Edit configure.ac to fix lipthread package name.
|
||||
|
||||
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
|
||||
7. Compile
|
||||
|
||||
you can use the default compile if you intend to use cpuminer-opt on the
|
||||
same CPU and the virtual machine supports that architecture.
|
||||
|
||||
./build.sh
|
||||
|
||||
Otherwise you can compile manually while setting options in CFLAGS.
|
||||
|
||||
Some common options:
|
||||
|
||||
To compile for a specific CPU architecture:
|
||||
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for AMD Ryzen.
|
||||
|
||||
You can compile more generically for a set of specific CPU features
|
||||
if you know what features you want:
|
||||
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for an older CPU that does not have AVX.
|
||||
|
||||
You can find several examples in build-allarch.sh
|
||||
|
||||
If you have a CPU with more than 64 threads and Windows 7 or higher you
|
||||
can enable the CPU Groups feature:
|
||||
|
||||
-D_WIN32_WINNT==0x0601
|
||||
|
||||
Once you have run configure successfully run make with n CPU threads:
|
||||
|
||||
make -j n
|
||||
|
||||
Copy cpuminer.exe to the release directory, compress and copy the release
|
||||
directory to a Windows system and run cpuminer.exe from the command line.
|
||||
|
||||
Run cpuminer
|
||||
|
||||
In a command windows change directories to the unzipped release folder.
|
||||
to get a list of all options:
|
||||
|
||||
cpuminer.exe --help
|
||||
|
||||
Command options are specific to where you mine. Refer to the pool's
|
||||
instructions on how to set them.
|
||||
|
24
Makefile.am
24
Makefile.am
@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
|
||||
api.c \
|
||||
sysinfos.c \
|
||||
algo-gate-api.c\
|
||||
malloc-huge.c \
|
||||
algo/argon2/argon2a/argon2a.c \
|
||||
algo/argon2/argon2a/ar2/argon2.c \
|
||||
algo/argon2/argon2a/ar2/opt.c \
|
||||
@@ -54,9 +55,6 @@ cpuminer_SOURCES = \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
algo/blake/blakecoin-4way.c \
|
||||
algo/blake/decred-gate.c \
|
||||
algo/blake/decred.c \
|
||||
algo/blake/decred-4way.c \
|
||||
algo/blake/pentablake-gate.c \
|
||||
algo/blake/pentablake-4way.c \
|
||||
algo/blake/pentablake.c \
|
||||
@@ -129,7 +127,7 @@ cpuminer_SOURCES = \
|
||||
algo/lyra2/allium.c \
|
||||
algo/lyra2/phi2-4way.c \
|
||||
algo/lyra2/phi2.c \
|
||||
algo//m7m/m7m.c \
|
||||
algo/m7m/m7m.c \
|
||||
algo/m7m/magimath.cpp \
|
||||
algo/nist5/nist5-gate.c \
|
||||
algo/nist5/nist5-4way.c \
|
||||
@@ -158,19 +156,27 @@ cpuminer_SOURCES = \
|
||||
algo/ripemd/lbry.c \
|
||||
algo/ripemd/lbry-4way.c \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/scrypt-core-4way.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/sha/sha256-hash.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha256-hash-opt.c \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256d-4way.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/sha/sha512256d-4way.c \
|
||||
algo/sha/sha256dt.c \
|
||||
algo/shabal/sph_shabal.c \
|
||||
algo/shabal/shabal-hash-4way.c \
|
||||
algo/shavite/sph_shavite.c \
|
||||
@@ -192,8 +198,12 @@ cpuminer_SOURCES = \
|
||||
algo/sm3/sm3-hash-4way.c \
|
||||
algo/swifftx/swifftx.c \
|
||||
algo/tiger/sph_tiger.c \
|
||||
algo/verthash/verthash-gate.c \
|
||||
algo/verthash/Verthash.c \
|
||||
algo/verthash/fopen_utf8.c \
|
||||
algo/verthash/tiny_sha3/sha3.c \
|
||||
algo/verthash/tiny_sha3/sha3-4way.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
algo/whirlpool/whirlpool-gate.c \
|
||||
algo/whirlpool/whirlpool.c \
|
||||
algo/whirlpool/whirlpoolx.c \
|
||||
@@ -273,11 +283,9 @@ cpuminer_SOURCES = \
|
||||
algo/x22/x22i-gate.c \
|
||||
algo/x22/x25x.c \
|
||||
algo/x22/x25x-4way.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/yescrypt-best.c \
|
||||
algo/yespower/yespower-gate.c \
|
||||
algo/yespower/yespower-blake2b.c \
|
||||
algo/yespower/crypto/blake2b-yp.c \
|
||||
algo/yespower/crypto/hmac-blake2b.c \
|
||||
algo/yespower/yescrypt-r8g.c \
|
||||
algo/yespower/yespower-opt.c
|
||||
|
||||
|
66
README.md
66
README.md
@@ -40,17 +40,25 @@ Requirements
|
||||
Intel Core2 and newer and AMD equivalents. Further optimizations are available
|
||||
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
|
||||
|
||||
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
|
||||
performance.
|
||||
32 bit CPUs are not supported.
|
||||
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
|
||||
are not supported.
|
||||
|
||||
ARM and Aarch64 CPUs are not supported.
|
||||
Mobile CPUs like laptop computers are not recommended because they aren't
|
||||
designed for extreme heat of operating at full load for extended periods of
|
||||
time.
|
||||
|
||||
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
|
||||
|
||||
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
|
||||
including Mint and Centos, are known to work and have all dependencies
|
||||
in their repositories. Others may work but may require more effort. Older
|
||||
versions such as Centos 6 don't work due to missing features.
|
||||
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
|
||||
|
||||
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
|
||||
binaries. WindowsXP 64 bit is YMMV.
|
||||
|
||||
FreeBSD is not actively tested but should work, YMMV.
|
||||
MacOS, OSx and Android are not supported.
|
||||
|
||||
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
||||
@@ -66,53 +74,50 @@ Supported Algorithms
|
||||
argon2d250 argon2d-crds, Credits (CRDS)
|
||||
argon2d500 argon2d-dyn, Dynamic (DYN)
|
||||
argon2d4096 argon2d-uis, Unitus, (UIS)
|
||||
axiom Shabal-256 MemoHash
|
||||
blake Blake-256 (SFR)
|
||||
blake2b Blake2b 256
|
||||
blake2s Blake-2 S
|
||||
blake Blake-256
|
||||
blake2b Blake2-512
|
||||
blake2s Blake2-256
|
||||
blakecoin blake256r8
|
||||
bmw BMW 256
|
||||
bmw512 BMW 512
|
||||
c11 Chaincoin
|
||||
c11
|
||||
decred
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
groestl Groestl coin
|
||||
hex x16r-hex
|
||||
hmq1725 Espers
|
||||
hmq1725
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
keccak Maxcoin
|
||||
keccakc Creative coin
|
||||
lbry LBC, LBRY Credits
|
||||
luffa Luffa
|
||||
lyra2h Hppcoin
|
||||
lyra2h
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2
|
||||
lyra2rev3 lyrav2v3, Vertcoin
|
||||
lyra2rev3 lyrav2v3
|
||||
lyra2z
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
minotaur Ringcoin (RNG)
|
||||
lyra2z330
|
||||
m7m
|
||||
minotaur
|
||||
minotaurx
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi
|
||||
phi2 Luxcoin (LUX)
|
||||
phi2-lux identical to phi2
|
||||
pluck Pluck:128 (Supcoin)
|
||||
phi2
|
||||
polytimos Ninja
|
||||
power2b MicroBitcoin (MBC)
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
scryptn2 scrypt(1048576, 1, 1)
|
||||
sha256d Double SHA-256
|
||||
sha256q Quad SHA-256, Pyrite (PYE)
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
sha256q Quad SHA-256
|
||||
sha256t Triple SHA-256
|
||||
sha3d Double keccak256 (BSHA3)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
@@ -122,22 +127,23 @@ Supported Algorithms
|
||||
tribus Denarius (DNR)
|
||||
vanilla blake256r8vnl (VCash)
|
||||
veltor (VLT)
|
||||
verthash Vertcoin
|
||||
whirlpool
|
||||
whirlpoolx
|
||||
x11 Dash
|
||||
x11evo Revolvercoin
|
||||
x11gost sib (SibCoin)
|
||||
x12 Galaxie Cash (GCH)
|
||||
x13 X13
|
||||
x12
|
||||
x13
|
||||
x13bcd bcd
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x14
|
||||
x15
|
||||
x16r
|
||||
x16rv2 Ravencoin (RVN)
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
x16rv2
|
||||
x16rt
|
||||
x16rt-veil veil
|
||||
x16s
|
||||
x17
|
||||
x21s
|
||||
x22i
|
||||
|
64
README.txt
64
README.txt
@@ -1,12 +1,22 @@
|
||||
This file is included in the Windows binary package. Compile instructions
|
||||
for Linux and Windows can be found in RELEASE_NOTES.
|
||||
|
||||
This package is officially avalable only from:
|
||||
cpuminer-opt is open source and free of any fees. Many forks exist that are
|
||||
closed source and contain usage fees. support open source free software.
|
||||
|
||||
This package is officially avalaible only from:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt
|
||||
|
||||
No other sources should be trusted.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS or Powershell
|
||||
prompt. There is no GUI and no mouse support.
|
||||
command prompt. There is no GUI and no mouse support.
|
||||
|
||||
New users are encouraged to consult the cpuminer-opt Wiki for detailed
|
||||
information on usage:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
@@ -14,18 +24,18 @@ miners. The source code is open for anyone to inspect. If you don't trust
|
||||
the software, don't use it.
|
||||
|
||||
Choose the exe that best matches you CPU's features or use trial and
|
||||
error to find the fastest one that doesn't crash. Pay attention to
|
||||
error to find the fastest one that works. Pay attention to
|
||||
the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing some
|
||||
features.
|
||||
Architecture names and compile options used are only provided for
|
||||
mainstream desktop CPUs. Budget CPUs like Pentium and Celeron are often
|
||||
missing some features. Check your CPU.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
Support for AMD CPUs older than Ryzen is incomplete and without specific
|
||||
recommendations. Find the best fit. CPUs older than Piledriver, including
|
||||
Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an
|
||||
incompatible implementation of SSE2 on these CPUs.
|
||||
|
||||
More information for Intel and AMD CPU architectures and their features
|
||||
can be found on Wikipedia.
|
||||
@@ -34,33 +44,35 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
File name Architecture name
|
||||
|
||||
Exe file name Compile flags Arch name
|
||||
cpuminer-sse2.exe Core2, Nehalem, generic x86_64 with SSE2
|
||||
cpuminer-aes-sse42.exe Westmere
|
||||
cpuminer-avx.exe Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
cpuminer-avx2-sha.exe AMD Zen1, Zen2
|
||||
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
|
||||
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
|
||||
cpuminer-avx512-sha-vaes.exe AMD Zen4, Intel Rocketlake, Icelake
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-marxh=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen.exe "-march=znver1" Zen1, Zen2
|
||||
cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(2)
|
||||
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake(3)
|
||||
|
||||
(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
|
||||
(2) Zen3 build uses Zen2+VAES as workaround until Zen3 compiler support is
|
||||
available. Zen2 CPUs should use Zen build.
|
||||
(3) Icelake is only available on some laptops. Mining with a laptop is not
|
||||
recommended.
|
||||
* Alderlake is a hybrid architecture with a mix of E-cores & P-cores. Although
|
||||
the P-cores can support AVX512 the E-cores can't so Intel decided to disable
|
||||
AVX512 on the the P-cores.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
Downloading DLL files from alternative sources presents an inherent
|
||||
security risk if their source is unknown. All DLL files included have
|
||||
been copied from the Ubuntu-20.04 instalation or compiled by me from
|
||||
been copied from the Ubuntu-20.04 installation or compiled by me from
|
||||
source code obtained from the author's official repository. The exact
|
||||
procedure is documented in the build instructions for Windows:
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Some included DLL files may already be installed on the system by Windows or
|
||||
third party packages. They often will work and may be used instead of the
|
||||
included version of the files.
|
||||
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
|
354
RELEASE_NOTES
354
RELEASE_NOTES
@@ -22,7 +22,7 @@ required.
|
||||
Compile Instructions
|
||||
--------------------
|
||||
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
|
||||
|
||||
Requirements
|
||||
------------
|
||||
@@ -65,11 +65,361 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.22.3
|
||||
|
||||
Data interleaving and byte swap optimizations iwith AVX2, AVX512 & AVX512VBMI.
|
||||
Faster Luffa with AVX2 & AVX512.
|
||||
Other small optimizations.
|
||||
Some code cleanup.
|
||||
|
||||
v3.22.2
|
||||
|
||||
Added sha512256d & sha256dt algos.
|
||||
Fixed intermittant invalid shares lyra2v2 AVX512.
|
||||
Removed application limits on the number of CPUs and threads, HW and OS limits still apply.
|
||||
Added a log warning if more threads are defined than active CPUs in affinity mask.
|
||||
Improved merkle tree memory management for stratum.
|
||||
Added transaction count to New Work log.
|
||||
Other small improvements.
|
||||
|
||||
v3.22.1
|
||||
|
||||
#393 fixed segfault in GBT, regression from v3.22.0.
|
||||
More efficient 32 bit data interleaving.
|
||||
|
||||
v3.22.0
|
||||
|
||||
Stratum: faster netdiff calculation.
|
||||
Merged a few updates from Pooler/cpuminer:
|
||||
Use CURLOPT_POSTFIELDS in json_rpc_call,
|
||||
Use CURLINFO_ACTIVESOCKET when supported,
|
||||
JSONRPC speedup,
|
||||
Speed up hex2bin function.
|
||||
Small log improvements, notably more frequent hash rate reports.
|
||||
Removed decred algo.
|
||||
|
||||
v3.21.5
|
||||
|
||||
All issues with v3.21.3 & v3.21.4 should be resolved.
|
||||
Changes since v3.21.2:
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
Small optimizations to serialized vectoring.
|
||||
|
||||
v3.21.4 CANCELLED
|
||||
|
||||
Reapply selected changes from v3.21.3.
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
|
||||
v3.21.3.1 UNRELEASED
|
||||
|
||||
Revert to 3.21.2
|
||||
|
||||
v3.21.3 CANCELLED
|
||||
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
|
||||
#392 Fixed conditional mining.
|
||||
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
Midstate prehash is now centralized, done only once instead of by every thread
|
||||
for selected algos.
|
||||
Small optimizations to serialized vectoring.
|
||||
|
||||
v3.21.2
|
||||
|
||||
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
|
||||
Fixed a couple of compiler warnings with gcc-12.
|
||||
|
||||
v3.21.1
|
||||
|
||||
Fixed a segfault in some obsolete algos.
|
||||
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
|
||||
|
||||
v3.21.0
|
||||
|
||||
Added minotaurx algo for stratum only.
|
||||
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
|
||||
Other small improvements.
|
||||
|
||||
v3.20.3
|
||||
|
||||
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
|
||||
Faster AVX2+VAES for anime 14%, hmq1725 6%.
|
||||
Small optimizations to Luffa AVX2 & AVX512.
|
||||
|
||||
v3.20.2
|
||||
|
||||
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
|
||||
Removed old unused yescrypt library and other unused code.
|
||||
|
||||
v3.20.1
|
||||
|
||||
sph_blake2b optimized 1-way SSSE3 & AVX2.
|
||||
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
|
||||
Removed imprecise hash & target display from rejected share log.
|
||||
Share and target difficulty is now displayed only for low difficulty shares.
|
||||
Updated configure.ac to check for AVX512 asm support.
|
||||
Small optimization to Lyra2 SSE2.
|
||||
|
||||
v3.20.0
|
||||
|
||||
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
|
||||
|
||||
v3.19.9
|
||||
|
||||
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
|
||||
Relaxed some excessively strict data alignment that was negatively affecting performance.
|
||||
|
||||
v3.19.8
|
||||
|
||||
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
|
||||
url protocol specifier for requesting a secure stratum connection.
|
||||
The full url, including the protocol, is now displayed in the stratum connect
|
||||
log and the periodic summary log.
|
||||
Small optimizations to Cubehash, AVX2 & AVX512.
|
||||
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
|
||||
|
||||
v3.19.7
|
||||
|
||||
#369 Fixed time limited mining, --time-limit.
|
||||
Fixed a potential compile error when using optimization below -O3.
|
||||
|
||||
v3.19.6
|
||||
|
||||
#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing
|
||||
Fixed handling of nonce exhaust when hashing a fast algo with extranonce disabled
|
||||
Small optimization to Shavite.
|
||||
|
||||
v3.19.5
|
||||
|
||||
Enhanced stratum-keepalive preemptively resets the stratum connection
|
||||
before the server to avoid lost shares.
|
||||
|
||||
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
|
||||
|
||||
X16RT: eliminate unnecessary recalculations of the hash order.
|
||||
|
||||
Fix a few compiler warnings.
|
||||
|
||||
Fixed log colour error when a block is solved.
|
||||
|
||||
v3.19.4
|
||||
|
||||
#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
|
||||
|
||||
New option stratum-keepalive prevents stratum timeouts when no shares are
|
||||
submitted for several minutes due to high difficulty.
|
||||
|
||||
Fixed a bug displaying optimizations for some algos.
|
||||
|
||||
v3.19.3
|
||||
|
||||
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
|
||||
|
||||
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
|
||||
|
||||
v3.19.2
|
||||
|
||||
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
|
||||
|
||||
Reduce log noise when replies to submitted shares are lost due to stratum errors.
|
||||
|
||||
Fugue prehash optimization for X16r family AVX2 & AVX512.
|
||||
|
||||
Small speed improvement for Hamsi AVX2 & AVX512.
|
||||
|
||||
Win: With CPU groups enabled the number of CPUs displayed in the ASCII art
|
||||
affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64.
|
||||
|
||||
v3.19.1
|
||||
|
||||
Changes to Windows binaries package:
|
||||
- builds for CPUs with AVX or lower have CPU groups disabled,
|
||||
- zen3 build renamed to avx2-sha-vaes to support Alderlake as well as Zen3,
|
||||
- zen build renamed to avx2-sha, supports Zen1 & Zen2,
|
||||
- avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
|
||||
- see README.txt for compatibility details.
|
||||
|
||||
Fixed a few compiler warnings that are new in GCC 11.
|
||||
Other minor fixes.
|
||||
|
||||
v3.19.0
|
||||
|
||||
Windows binaries now built with support for CPU groups, requires Windows 7.
|
||||
|
||||
Changes to cpu-affinity:
|
||||
- PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups,
|
||||
- added support for CPU affinity for up to 256 threads or CPUs,
|
||||
- streamlined code for more efficient initialization of miner threads,
|
||||
- precise affining of each miner thread to a specific CPU,
|
||||
- added an option to disable CPU affinity with "--cpu-affinity 0"
|
||||
|
||||
Faster sha256t with AVX512 & AVX2.
|
||||
|
||||
Added stratum error count to stats log, reported only when non-zero.
|
||||
|
||||
v3.18.2
|
||||
|
||||
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
|
||||
|
||||
AVX512 for sha256d.
|
||||
|
||||
SSE42 and AVX may now be displayed as mining features at startup.
|
||||
This is hard coded for each algo, and is only implemented for scrypt
|
||||
at this time as it is the only algo with significant performance differences
|
||||
with those features.
|
||||
|
||||
Fixed an issue where a high hashrate algo could cause excessive invalid hash
|
||||
rate log reports when starting up in benchmark mode.
|
||||
|
||||
v3.18.1
|
||||
|
||||
More speed for scrypt:
|
||||
- additional scryptn2 optimizations for all CPU architectures,
|
||||
- AVX2 is now used by default on CPUS with SHA but not AVX512,
|
||||
- scrypt:1024 performance lost in v3.18.0 is restored,
|
||||
- AVX512 & AVX2 improvements to scrypt:1024.
|
||||
|
||||
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
|
||||
|
||||
Issue #337: fixed a problem that could display negative stats values in the
|
||||
first summary report if the report was forced prematurely due to a stratum
|
||||
diff change. The stats will still be invalid but should display zeros.
|
||||
|
||||
v3.18.0
|
||||
|
||||
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- AVX512 & SHA support for sha256, AVX512 has priority,
|
||||
- up to 50% increase in hashrate,
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
- scrypt, default N=1024 (LTC), will likely perform slower.
|
||||
|
||||
Improved stale share detection and handling for Scrypt with large N factor:
|
||||
- abort and discard partially computed hash when new work is detected,
|
||||
- quicker response to new job, less time wasted mining stale job.
|
||||
|
||||
Improved stale share handling for all algorithms:
|
||||
- report possible stale share when new work received with a previously
|
||||
submitted share still pending,
|
||||
- when new work is detected report the submission of an already completed,
|
||||
otherwise valid, but likely stale, share,
|
||||
- fixed incorrect block height in stale share log.
|
||||
|
||||
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
|
||||
|
||||
When stratum disconnects miner threads go to idle until reconnected.
|
||||
|
||||
Colour changes to some logs.
|
||||
|
||||
Some low level function name changes for clarity and consistency.
|
||||
|
||||
The reference hashrate in the summary log and the benchmark total hashrate
|
||||
are now the mean hashrate for the session.
|
||||
|
||||
v3.17.1
|
||||
|
||||
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
|
||||
More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
|
||||
Fixed my-gr algo for VAES.
|
||||
|
||||
v3.17.0
|
||||
|
||||
AVX512 optimized using ternary logic instructions.
|
||||
Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
|
||||
Use SHA on supported CPUs to produce merkle hash.
|
||||
Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
|
||||
|
||||
v3.16.5
|
||||
|
||||
#329: Fixed GBT incorrect target diff in stats, second attempt.
|
||||
Fixed formatting error in share result log when --no-color option is used.
|
||||
|
||||
v3.16.4
|
||||
|
||||
Faster sha512 and sha256 when not using SHA CPU extension.
|
||||
#329: Fixed GBT incorrect target diff in stats.
|
||||
|
||||
v3.16.3
|
||||
|
||||
#313 Fix compile error with GCC 11.
|
||||
Incremental improvements to verthash.
|
||||
|
||||
v3.16.2
|
||||
|
||||
Verthash: midstate prehash optimization for all architectures.
|
||||
Verthash: AVX2 optimization.
|
||||
GBT: added support for Bech32 addresses.
|
||||
Linux: added CPU frequency to benchmark log.
|
||||
Fixed integer overflow in time calculations.
|
||||
|
||||
v3.16.1
|
||||
|
||||
New options for verthash:
|
||||
--data-file to specify the name, and optionally the path, of the verthash
|
||||
data file, default is "verthash.dat" in the current directory.
|
||||
--verify to perform the data file integrity check at startup, default is
|
||||
not to verify data file integrity.
|
||||
Support for creation of default verthash data file if:
|
||||
1) --data-file option is not used,
|
||||
2) no default data file is found in the current directory, and,
|
||||
3) --verify option is used.
|
||||
More detailed logs related to verthash data file.
|
||||
Small verthash performance improvement.
|
||||
Fixed detection of corrupt stats caused by networking issues.
|
||||
|
||||
v3.16.0
|
||||
|
||||
Added verthash algo.
|
||||
|
||||
v3.15.7
|
||||
|
||||
Added accepted/stale/rejected percentage to summary log report.
|
||||
Added warning if share counters mismatch which could corrupt stats.
|
||||
Linux: CPU temperature reporting is more responsive to rising temperature.
|
||||
A few AVX2 & AVX512 tweaks.
|
||||
Removed some dead code and other cleanup.
|
||||
|
||||
v3.15.6
|
||||
|
||||
Implement keccak pre-hash optimization for x16* algos.
|
||||
Move conditional mining test to before get_new_work in miner thread.
|
||||
Add test for share reject reason when solo mining.
|
||||
Add support for floating point, as well as integer, "networkhasps" in
|
||||
RPC getmininginfo method.
|
||||
|
||||
v3.15.5
|
||||
|
||||
Fix stratum jobs lost if 2 jobs received in less than one second.
|
||||
|
||||
v3.15.4
|
||||
|
||||
Fixed yescryptr16 broken in v3.15.3.
|
||||
|
||||
v3.15.3
|
||||
|
||||
Yescrypt algos now use yespower v0.5, a little faster.
|
||||
New implementation of sha256 using SHA CPU extension.
|
||||
Replace Openssl with SPH for sha256 & sha512.
|
||||
AVX512 optimization for sha256t & sha256q.
|
||||
Faster sha256t, sha256q, x21s, x22i & x25x on CPUs with SHA without AVX512.
|
||||
AVX512+SHA build for Intel Rocketlake added to Windows binary package.
|
||||
|
||||
v3.15.2
|
||||
|
||||
Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
|
||||
allium.
|
||||
Zen3 build added to Windows binary package.
|
||||
Zen3 (AVX2+SHA+VAES) build added to Windows binary package.
|
||||
|
||||
v3.15.1
|
||||
|
||||
|
83
aclocal.m4
vendored
83
aclocal.m4
vendored
@@ -1,6 +1,6 @@
|
||||
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
|
||||
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -14,13 +14,13 @@
|
||||
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
|
||||
[m4_warning([this file was generated for autoconf 2.69.
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
|
||||
[m4_warning([this file was generated for autoconf 2.71.
|
||||
You have another version of autoconf. It may work, but is not guaranteed to.
|
||||
If you have problems, you may need to regenerate the build system entirely.
|
||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||
|
||||
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
|
||||
[am__api_version='1.16'
|
||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||
dnl require some minimum version. Point them to the right macro.
|
||||
m4_if([$1], [1.16.1], [],
|
||||
m4_if([$1], [1.16.5], [],
|
||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||
])
|
||||
|
||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||
[AM_AUTOMAKE_VERSION([1.16.1])dnl
|
||||
[AM_AUTOMAKE_VERSION([1.16.5])dnl
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
|
||||
# Figure out how to run the assembler. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
|
||||
|
||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
|
||||
# AM_CONDITIONAL -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
Usually this means the macro was only invoked conditionally.]])
|
||||
fi])])
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
||||
|
||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
done
|
||||
if test $am_rc -ne 0; then
|
||||
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
||||
for automatic dependency tracking. Try re-running configure with the
|
||||
for automatic dependency tracking. If GNU make was not used, consider
|
||||
re-running the configure script with MAKE="gmake" (or whatever is
|
||||
necessary). You can also try re-running configure with the
|
||||
'--disable-dependency-tracking' option to at least be able to build
|
||||
the package (albeit without support for automatic dependency tracking).])
|
||||
fi
|
||||
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
|
||||
# Do all the work for Automake. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
|
||||
# release and drop the old call support.
|
||||
AC_DEFUN([AM_INIT_AUTOMAKE],
|
||||
[AC_PREREQ([2.65])dnl
|
||||
m4_ifdef([_$0_ALREADY_INIT],
|
||||
[m4_fatal([$0 expanded multiple times
|
||||
]m4_defn([_$0_ALREADY_INIT]))],
|
||||
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
|
||||
dnl Autoconf wants to disallow AM_ names. We explicitly allow
|
||||
dnl the ones we care about.
|
||||
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
|
||||
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
|
||||
[_AM_SET_OPTIONS([$1])dnl
|
||||
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
|
||||
m4_if(
|
||||
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
|
||||
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
|
||||
[ok:ok],,
|
||||
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
|
||||
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
|
||||
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
|
||||
[m4_define([AC_PROG_OBJCXX],
|
||||
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
|
||||
])
|
||||
# Variables for tags utilities; see am/tags.am
|
||||
if test -z "$CTAGS"; then
|
||||
CTAGS=ctags
|
||||
fi
|
||||
AC_SUBST([CTAGS])
|
||||
if test -z "$ETAGS"; then
|
||||
ETAGS=etags
|
||||
fi
|
||||
AC_SUBST([ETAGS])
|
||||
if test -z "$CSCOPE"; then
|
||||
CSCOPE=cscope
|
||||
fi
|
||||
AC_SUBST([CSCOPE])
|
||||
|
||||
AC_REQUIRE([AM_SILENT_RULES])dnl
|
||||
dnl The testsuite driver may need to know about EXEEXT, so add the
|
||||
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
|
||||
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
|
||||
done
|
||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
|
||||
fi
|
||||
AC_SUBST([install_sh])])
|
||||
|
||||
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
|
||||
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
||||
# From Jim Meyering
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
||||
|
||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])
|
||||
|
||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
|
||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||
AC_REQUIRE_AUX_FILE([missing])dnl
|
||||
if test x"${MISSING+set}" != xset; then
|
||||
case $am_aux_dir in
|
||||
*\ * | *\ *)
|
||||
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
|
||||
*)
|
||||
MISSING="\${SHELL} $am_aux_dir/missing" ;;
|
||||
esac
|
||||
MISSING="\${SHELL} '$am_aux_dir/missing'"
|
||||
fi
|
||||
# Use eval to expand $SHELL
|
||||
if eval "$MISSING --is-lightweight"; then
|
||||
@@ -775,7 +790,7 @@ fi
|
||||
|
||||
# Helper functions for option handling. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
||||
AC_DEFUN([_AM_IF_OPTION],
|
||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
|
||||
# For backward compatibility.
|
||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
rm -f conftest.file
|
||||
])
|
||||
|
||||
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||
])
|
||||
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1039,7 +1054,7 @@ fi
|
||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||
|
||||
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
||||
|
||||
# Check how to create a tarball. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
|
202
algo-gate-api.c
202
algo-gate-api.c
@@ -15,8 +15,6 @@
|
||||
#include <stdbool.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
#include <openssl/sha.h>
|
||||
//#include "miner.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
// Define null and standard functions.
|
||||
@@ -69,7 +67,6 @@ void do_nothing () {}
|
||||
bool return_true () { return true; }
|
||||
bool return_false () { return false; }
|
||||
void *return_null () { return NULL; }
|
||||
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
|
||||
|
||||
void algo_not_tested()
|
||||
{
|
||||
@@ -97,7 +94,8 @@ int null_scanhash()
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Default generic scanhash can be used in many cases.
|
||||
// Default generic scanhash can be used in many cases. Not to be used when
|
||||
// prehashing can be done or when byte swapping the data can be avoided.
|
||||
int scanhash_generic( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -154,6 +152,9 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
// overwrite byte swapped nonce with original byte order for proper
|
||||
// incrementing. The nonce only needs to byte swapped if it is to be
|
||||
// sumbitted.
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
@@ -262,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->build_block_header = (void*)&std_build_block_header;
|
||||
gate->build_extraheader = (void*)&std_build_extraheader;
|
||||
gate->set_work_data_endian = (void*)&do_nothing;
|
||||
gate->calc_network_diff = (void*)&std_calc_network_diff;
|
||||
gate->ready_to_mine = (void*)&std_ready_to_mine;
|
||||
gate->resync_threads = (void*)&do_nothing;
|
||||
gate->do_this_thread = (void*)&return_true;
|
||||
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
|
||||
@@ -279,9 +278,11 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
|
||||
|
||||
// called by each thread that uses the gate
|
||||
// Called once by main
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
{
|
||||
bool rc = false;
|
||||
|
||||
if ( NULL == gate )
|
||||
{
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
|
||||
@@ -292,102 +293,104 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEX: register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: register_phi2_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13BCD: register_x13bcd_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: rc = register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
|
||||
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: rc = register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: rc = register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: rc = register_c11_algo ( gate ); break;
|
||||
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEX: rc = register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: rc = register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: rc = register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: rc = register_lbry_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: rc = register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: rc = register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: rc = register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: rc = register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: rc = register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: rc = register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: rc = register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: rc = register_phi2_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: rc = register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: rc = register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: rc = register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: rc = register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: rc = register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: rc = register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: rc = register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: rc = register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: rc = register_veltor_algo ( gate ); break;
|
||||
case ALGO_VERTHASH: rc = register_verthash_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: rc = register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: rc = register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: rc = register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: rc = register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: rc = register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: rc = register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: rc = register_x13_algo ( gate ); break;
|
||||
case ALGO_X13BCD: rc = register_x13bcd_algo ( gate ); break;
|
||||
case ALGO_X13SM3: rc = register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: rc = register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: rc = register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: rc = register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: rc = register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: rc = register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: rc = register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: rc = register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
applog(LOG_ERR,"BUG: unregistered algorithm %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
} // switch
|
||||
|
||||
// ensure required functions were defined.
|
||||
if ( gate->scanhash == (void*)&null_scanhash )
|
||||
if ( !rc )
|
||||
{
|
||||
applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
|
||||
applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -415,7 +418,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
|
||||
const char* const algo_alias_map[][2] =
|
||||
{
|
||||
// alias proper
|
||||
{ "argon2d-crds", "argon2d250" },
|
||||
{ "argon2d-dyn", "argon2d500" },
|
||||
{ "argon2d-uis", "argon2d4096" },
|
||||
{ "bcd", "x13bcd" },
|
||||
@@ -424,13 +426,11 @@ const char* const algo_alias_map[][2] =
|
||||
{ "blake256r8", "blakecoin" },
|
||||
{ "blake256r8vnl", "vanilla" },
|
||||
{ "blake256r14", "blake" },
|
||||
{ "blake256r14dcr", "decred" },
|
||||
{ "diamond", "dmd-gr" },
|
||||
{ "espers", "hmq1725" },
|
||||
{ "flax", "c11" },
|
||||
{ "hsr", "x13sm3" },
|
||||
{ "jackpot", "jha" },
|
||||
{ "jane", "scryptjane" },
|
||||
{ "lyra2", "lyra2re" },
|
||||
{ "lyra2v2", "lyra2rev2" },
|
||||
{ "lyra2v3", "lyra2rev3" },
|
||||
|
@@ -1,3 +1,6 @@
|
||||
#ifndef __ALGO_GATE_API_H__
|
||||
#define __ALGO_GATE_API_H__ 1
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
@@ -94,7 +97,6 @@ typedef uint32_t set_t;
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
|
||||
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
|
||||
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
|
||||
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
@@ -114,15 +116,15 @@ typedef struct
|
||||
// Mandatory functions, one of these is mandatory. If a generic scanhash
|
||||
// is used a custom target hash function must be registered, with a custom
|
||||
// scanhash the target hash function can be called directly and doesn't need
|
||||
// to be registered in the gate.
|
||||
// to be registered with the gate.
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
int ( *hash ) ( void*, const void*, int );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
|
||||
// Allocate thread local buffers and other initialization specific to miner
|
||||
// threads.
|
||||
// Called once by each miner thread to allocate thread local buffers and
|
||||
// other initialization specific to miner threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
@@ -150,21 +152,15 @@ void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
|
||||
// Big or little
|
||||
// Big endian or little endian
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
|
||||
// Wait for first work
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
|
||||
// Diverge mining threads
|
||||
bool ( *do_this_thread ) ( int );
|
||||
|
||||
// After do_this_thread
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
void ( *resync_threads ) ( int, struct work* );
|
||||
|
||||
// No longer needed
|
||||
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
|
||||
|
||||
set_t optimizations;
|
||||
@@ -281,11 +277,9 @@ void std_be_build_stratum_request( char *req, struct work *work );
|
||||
|
||||
char* std_malloc_txs_request( struct work *work );
|
||||
|
||||
// Default is do_nothing (assumed LE)
|
||||
// Default is do_nothing, little endian is assumed
|
||||
void set_work_data_big_endian( struct work *work );
|
||||
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
||||
void std_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits,
|
||||
@@ -295,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
|
||||
|
||||
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
|
||||
|
||||
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id );
|
||||
|
||||
int std_get_work_data_size();
|
||||
|
||||
// Gate admin functions
|
||||
@@ -319,3 +310,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
#endif
|
||||
|
@@ -344,7 +344,7 @@ static size_t
|
||||
detect_cpu(void) {
|
||||
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
//cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs;
|
||||
x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
|
||||
uint32_t max_level, max_ext_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
|
@@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
/*
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
(void)blocks; (void)nblocks;
|
||||
}
|
||||
|
||||
*/
|
||||
/* romix pre/post endian conversion function */
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
|
@@ -37,6 +37,13 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static inline __m512i blamka( __m512i x, __m512i y )
|
||||
{
|
||||
__m512i xy = _mm512_mul_epu32( x, y );
|
||||
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
|
||||
_mm512_add_epi64( xy, xy ) );
|
||||
}
|
||||
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
|
@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define ror64(x, n) _mm512_ror_epi64((x), (n))
|
||||
|
||||
static __m512i muladd(__m512i x, __m512i y)
|
||||
static inline __m512i muladd(__m512i x, __m512i y)
|
||||
{
|
||||
__m512i z = _mm512_mul_epu32(x, y);
|
||||
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
||||
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 32); \
|
||||
D1 = ror64(D1, 32); \
|
||||
D0 = _mm512_ror_epi64(D0, 32); \
|
||||
D1 = _mm512_ror_epi64(D1, 32); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 24); \
|
||||
B1 = ror64(B1, 24); \
|
||||
B0 = _mm512_ror_epi64(B0, 24); \
|
||||
B1 = _mm512_ror_epi64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 16); \
|
||||
D1 = ror64(D1, 16); \
|
||||
D0 = _mm512_ror_epi64(D0, 16); \
|
||||
D1 = _mm512_ror_epi64(D1, 16); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 63); \
|
||||
B1 = ror64(B1, 63); \
|
||||
B0 = _mm512_ror_epi64(B0, 63); \
|
||||
B1 = _mm512_ror_epi64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
|
||||
#define SWAP_HALVES(A0, A1) \
|
||||
do { \
|
||||
__m512i t0, t1; \
|
||||
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
A0 = t0; \
|
||||
A1 = t1; \
|
||||
__m512i t; \
|
||||
t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
A0 = t; \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
|
@@ -49,6 +49,20 @@ extern "C"{
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2
|
||||
|
||||
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
// Expects data in little endian order, no byte swap needed
|
||||
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
@@ -180,6 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
|
||||
};
|
||||
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
#define Z02 2
|
||||
#define Z03 3
|
||||
#define Z04 4
|
||||
#define Z05 5
|
||||
#define Z06 6
|
||||
#define Z07 7
|
||||
#define Z08 8
|
||||
#define Z09 9
|
||||
#define Z0A A
|
||||
#define Z0B B
|
||||
#define Z0C C
|
||||
#define Z0D D
|
||||
#define Z0E E
|
||||
#define Z0F F
|
||||
|
||||
#define Z10 E
|
||||
#define Z11 A
|
||||
#define Z12 4
|
||||
#define Z13 8
|
||||
#define Z14 9
|
||||
#define Z15 F
|
||||
#define Z16 D
|
||||
#define Z17 6
|
||||
#define Z18 1
|
||||
#define Z19 C
|
||||
#define Z1A 0
|
||||
#define Z1B 2
|
||||
#define Z1C B
|
||||
#define Z1D 7
|
||||
#define Z1E 5
|
||||
#define Z1F 3
|
||||
|
||||
#define Z20 B
|
||||
#define Z21 8
|
||||
#define Z22 C
|
||||
#define Z23 0
|
||||
#define Z24 5
|
||||
#define Z25 2
|
||||
#define Z26 F
|
||||
#define Z27 D
|
||||
#define Z28 A
|
||||
#define Z29 E
|
||||
#define Z2A 3
|
||||
#define Z2B 6
|
||||
#define Z2C 7
|
||||
#define Z2D 1
|
||||
#define Z2E 9
|
||||
#define Z2F 4
|
||||
|
||||
#define Z30 7
|
||||
#define Z31 9
|
||||
#define Z32 3
|
||||
#define Z33 1
|
||||
#define Z34 D
|
||||
#define Z35 C
|
||||
#define Z36 B
|
||||
#define Z37 E
|
||||
#define Z38 2
|
||||
#define Z39 6
|
||||
#define Z3A 5
|
||||
#define Z3B A
|
||||
#define Z3C 4
|
||||
#define Z3D 0
|
||||
#define Z3E F
|
||||
#define Z3F 8
|
||||
|
||||
#define Z40 9
|
||||
#define Z41 0
|
||||
#define Z42 5
|
||||
#define Z43 7
|
||||
#define Z44 2
|
||||
#define Z45 4
|
||||
#define Z46 A
|
||||
#define Z47 F
|
||||
#define Z48 E
|
||||
#define Z49 1
|
||||
#define Z4A B
|
||||
#define Z4B C
|
||||
#define Z4C 6
|
||||
#define Z4D 8
|
||||
#define Z4E 3
|
||||
#define Z4F D
|
||||
|
||||
#define Z50 2
|
||||
#define Z51 C
|
||||
#define Z52 6
|
||||
#define Z53 A
|
||||
#define Z54 0
|
||||
#define Z55 B
|
||||
#define Z56 8
|
||||
#define Z57 3
|
||||
#define Z58 4
|
||||
#define Z59 D
|
||||
#define Z5A 7
|
||||
#define Z5B 5
|
||||
#define Z5C F
|
||||
#define Z5D E
|
||||
#define Z5E 1
|
||||
#define Z5F 9
|
||||
|
||||
#define Z60 C
|
||||
#define Z61 5
|
||||
#define Z62 1
|
||||
#define Z63 F
|
||||
#define Z64 E
|
||||
#define Z65 D
|
||||
#define Z66 4
|
||||
#define Z67 A
|
||||
#define Z68 0
|
||||
#define Z69 7
|
||||
#define Z6A 6
|
||||
#define Z6B 3
|
||||
#define Z6C 9
|
||||
#define Z6D 2
|
||||
#define Z6E 8
|
||||
#define Z6F B
|
||||
|
||||
#define Z70 D
|
||||
#define Z71 B
|
||||
#define Z72 7
|
||||
#define Z73 E
|
||||
#define Z74 C
|
||||
#define Z75 1
|
||||
#define Z76 3
|
||||
#define Z77 9
|
||||
#define Z78 5
|
||||
#define Z79 0
|
||||
#define Z7A F
|
||||
#define Z7B 4
|
||||
#define Z7C 8
|
||||
#define Z7D 6
|
||||
#define Z7E 2
|
||||
#define Z7F A
|
||||
|
||||
#define Z80 6
|
||||
#define Z81 F
|
||||
#define Z82 E
|
||||
#define Z83 9
|
||||
#define Z84 B
|
||||
#define Z85 3
|
||||
#define Z86 0
|
||||
#define Z87 8
|
||||
#define Z88 C
|
||||
#define Z89 2
|
||||
#define Z8A D
|
||||
#define Z8B 7
|
||||
#define Z8C 1
|
||||
#define Z8D 4
|
||||
#define Z8E A
|
||||
#define Z8F 5
|
||||
|
||||
#define Z90 A
|
||||
#define Z91 2
|
||||
#define Z92 8
|
||||
#define Z93 4
|
||||
#define Z94 7
|
||||
#define Z95 6
|
||||
#define Z96 1
|
||||
#define Z97 5
|
||||
#define Z98 F
|
||||
#define Z99 B
|
||||
#define Z9A 9
|
||||
#define Z9B E
|
||||
#define Z9C 3
|
||||
#define Z9D C
|
||||
#define Z9E D
|
||||
#define Z9F 0
|
||||
|
||||
#define Mx(r, i) Mx_(Z ## r ## i)
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define B2B8W_G(a, b, c, d, x, y) \
|
||||
@@ -122,14 +296,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
|
||||
ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
|
||||
ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
|
||||
ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
|
||||
ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
|
||||
ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
|
||||
ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
|
||||
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
@@ -214,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
#define B2B_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
|
||||
v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
|
||||
v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
|
||||
v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
@@ -17,7 +17,7 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN(128) typedef struct {
|
||||
typedef struct ALIGN( 64 ) {
|
||||
__m512i b[16]; // input buffer
|
||||
__m512i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(128) typedef struct {
|
||||
typedef struct ALIGN( 64 ) {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
@@ -108,11 +108,11 @@ do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
} while(0)
|
||||
@@ -320,11 +320,11 @@ do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
@@ -368,7 +368,7 @@ do { \
|
||||
ROUND8W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G8W
|
||||
#undef ROUND8W
|
||||
@@ -566,7 +566,7 @@ do { \
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
|
@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
|
||||
} blake2s_nway_param;
|
||||
#pragma pack(pop)
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_4way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_4way_state
|
||||
{
|
||||
__m128i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
|
||||
@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_8way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_8way_state
|
||||
{
|
||||
__m256i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
|
||||
@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN( 128 ) typedef struct __blake2s_16way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,74 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined (DECRED_4WAY)
|
||||
|
||||
static __thread blake256_4way_context blake_mid;
|
||||
|
||||
void decred_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
// uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way_update( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) edata[48];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// copy to buffer guaranteed to be aligned.
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
* noncep = n;
|
||||
*(noncep+1) = n+1;
|
||||
*(noncep+2) = n+2;
|
||||
*(noncep+3) = n+3;
|
||||
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,168 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
uint32_t *decred_get_nonceptr( uint32_t *work_data )
|
||||
{
|
||||
return &work_data[ DECRED_NONCE_INDEX ];
|
||||
}
|
||||
|
||||
double decred_calc_network_diff( struct work* work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
int m;
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
for ( m = 29; m < shift; m++ )
|
||||
d /= 256.0;
|
||||
if ( shift == 28 )
|
||||
d *= 256.0; // testnet
|
||||
if ( opt_debug_diff )
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
|
||||
shift, bits );
|
||||
return net_diff;
|
||||
}
|
||||
|
||||
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
{
|
||||
// some random extradata to make the work unique
|
||||
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
|
||||
work->height = work->data[32];
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if ( net_diff > 0. )
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
work->targetdiff);
|
||||
else
|
||||
sprintf(netinfo, ", diff %.3f", net_diff);
|
||||
}
|
||||
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
|
||||
netinfo);
|
||||
*net_blocks = work->height - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void decred_be_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
|
||||
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
|
||||
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
|
||||
sctx->xnonce1_size );
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free(xnonce2str);
|
||||
}
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
|
||||
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
|
||||
int i;
|
||||
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = swab32(
|
||||
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
|
||||
|
||||
// for ( i = 0; i < 8; i++ ) // prevhash
|
||||
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
|
||||
// for ( i = 0; i < 8; i++ ) // merkle
|
||||
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
|
||||
|
||||
for ( i = 0; i < headersize/4; i++ ) // header
|
||||
g_work->data[17 + i] = extraheader[i];
|
||||
// extradata
|
||||
|
||||
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
|
||||
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
|
||||
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
|
||||
g_work->data[i] = 0;
|
||||
g_work->data[37] = (rand()*4) << 8;
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->block_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
if ( have_stratum && !work->data[0] && !opt_benchmark )
|
||||
{
|
||||
sleep(1);
|
||||
return false;
|
||||
}
|
||||
// extradata: prevent duplicates
|
||||
work->data[ DECRED_XNONCE_INDEX ] += 1;
|
||||
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
|
||||
|
||||
bool register_decred_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(DECRED_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->scanhash = (void*)&scanhash_decred_4way;
|
||||
gate->hash = (void*)&decred_hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->build_extraheader = (void*)&decred_build_extraheader;
|
||||
gate->ready_to_mine = (void*)&decred_ready_to_mine;
|
||||
gate->nbits_index = DECRED_NBITS_INDEX;
|
||||
gate->ntime_index = DECRED_NTIME_INDEX;
|
||||
gate->nonce_index = DECRED_NONCE_INDEX;
|
||||
gate->get_work_data_size = (void*)&decred_get_work_data_size;
|
||||
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
|
||||
allow_mininginfo = false;
|
||||
have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
|
@@ -1,36 +0,0 @@
|
||||
#ifndef __DECRED_GATE_H__
|
||||
#define __DECRED_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define DECRED_NBITS_INDEX 29
|
||||
#define DECRED_NTIME_INDEX 34
|
||||
#define DECRED_NONCE_INDEX 35
|
||||
#define DECRED_XNONCE_INDEX 36
|
||||
#define DECRED_DATA_SIZE 192
|
||||
#define DECRED_WORK_COMPARE_SIZE 140
|
||||
#define DECRED_MIDSTATE_LEN 128
|
||||
|
||||
#if defined (__AVX2__)
|
||||
//void blakehash_84way(void *state, const void *input);
|
||||
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done );
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#define DECRED_4WAY
|
||||
#endif
|
||||
|
||||
#if defined (DECRED_4WAY)
|
||||
void decred_hash_4way(void *state, const void *input);
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void decred_hash( void *state, const void *input );
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
@@ -1,282 +0,0 @@
|
||||
#include "decred-gate.h"
|
||||
|
||||
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
|
||||
|
||||
#include "sph_blake.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
|
||||
#ifndef min
|
||||
#define min(a,b) (a>b ? b : a)
|
||||
#endif
|
||||
#ifndef max
|
||||
#define max(a,b) (a<b ? b : a)
|
||||
#endif
|
||||
*/
|
||||
/*
|
||||
#define DECRED_NBITS_INDEX 29
|
||||
#define DECRED_NTIME_INDEX 34
|
||||
#define DECRED_NONCE_INDEX 35
|
||||
#define DECRED_XNONCE_INDEX 36
|
||||
#define DECRED_DATA_SIZE 192
|
||||
#define DECRED_WORK_COMPARE_SIZE 140
|
||||
*/
|
||||
static __thread sph_blake256_context blake_mid;
|
||||
static __thread bool ctx_midstate_done = false;
|
||||
|
||||
void decred_hash(void *state, const void *input)
|
||||
{
|
||||
// #define MIDSTATE_LEN 128
|
||||
sph_blake256_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
uint8_t *ending = (uint8_t*) input;
|
||||
ending += DECRED_MIDSTATE_LEN;
|
||||
|
||||
if (!ctx_midstate_done) {
|
||||
sph_blake256_init(&blake_mid);
|
||||
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
|
||||
ctx_midstate_done = true;
|
||||
}
|
||||
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
|
||||
|
||||
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
|
||||
sph_blake256_close(&ctx, state);
|
||||
}
|
||||
|
||||
void decred_hash_simple(void *state, const void *input)
|
||||
{
|
||||
sph_blake256_context ctx;
|
||||
sph_blake256_init(&ctx);
|
||||
sph_blake256(&ctx, input, 180);
|
||||
sph_blake256_close(&ctx, state);
|
||||
}
|
||||
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
#if 1
|
||||
memcpy(endiandata, pdata, 180);
|
||||
#else
|
||||
for (int k=0; k < (180/4); k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
#endif
|
||||
|
||||
do {
|
||||
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
|
||||
endiandata[DECRED_NONCE_INDEX] = n;
|
||||
decred_hash(hash32, endiandata);
|
||||
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
uint32_t *decred_get_nonceptr( uint32_t *work_data )
|
||||
{
|
||||
return &work_data[ DECRED_NONCE_INDEX ];
|
||||
}
|
||||
|
||||
double decred_calc_network_diff( struct work* work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
int m;
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
for ( m = 29; m < shift; m++ )
|
||||
d /= 256.0;
|
||||
if ( shift == 28 )
|
||||
d *= 256.0; // testnet
|
||||
if ( opt_debug_diff )
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
|
||||
shift, bits );
|
||||
return net_diff;
|
||||
}
|
||||
|
||||
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
{
|
||||
// some random extradata to make the work unique
|
||||
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
|
||||
work->height = work->data[32];
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (net_diff > 0.)
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
work->targetdiff);
|
||||
else
|
||||
sprintf(netinfo, ", diff %.3f", net_diff);
|
||||
}
|
||||
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
|
||||
netinfo);
|
||||
*net_blocks = work->height - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void decred_be_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
|
||||
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
|
||||
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
|
||||
sctx->xnonce1_size );
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free(xnonce2str);
|
||||
}
|
||||
*/
|
||||
/*
|
||||
// data shared between gen_merkle_root and build_extraheader.
|
||||
__thread uint32_t decred_extraheader[32] = { 0 };
|
||||
__thread int decred_headersize = 0;
|
||||
|
||||
void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
decred_headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(decred_extraheader) );
|
||||
memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
|
||||
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
|
||||
size_t t;
|
||||
int i;
|
||||
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
memcpy(merkle_root, sctx->job.coinbase, 32);
|
||||
headersize = min((int)sctx->job.coinbase_size - 32,
|
||||
sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = swab32(
|
||||
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
|
||||
|
||||
// for ( i = 0; i < 8; i++ ) // prevhash
|
||||
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
|
||||
// for ( i = 0; i < 8; i++ ) // merkle
|
||||
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
|
||||
|
||||
for ( i = 0; i < headersize/4; i++ ) // header
|
||||
g_work->data[17 + i] = extraheader[i];
|
||||
// extradata
|
||||
|
||||
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
|
||||
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
|
||||
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
|
||||
g_work->data[i] = 0;
|
||||
g_work->data[37] = (rand()*4) << 8;
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->bloc_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
if ( have_stratum && !work->data[0] && !opt_benchmark )
|
||||
{
|
||||
sleep(1);
|
||||
return false;
|
||||
}
|
||||
// extradata: prevent duplicates
|
||||
work->data[ DECRED_XNONCE_INDEX ] += 1;
|
||||
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool register_decred_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->build_extraheader = (void*)&decred_build_extraheader;
|
||||
gate->ready_to_mine = (void*)&decred_ready_to_mine;
|
||||
gate->nbits_index = DECRED_NBITS_INDEX;
|
||||
gate->ntime_index = DECRED_NTIME_INDEX;
|
||||
gate->nonce_index = DECRED_NONCE_INDEX;
|
||||
gate->work_data_size = DECRED_DATA_SIZE;
|
||||
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
|
||||
allow_mininginfo = false;
|
||||
have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
@@ -1,6 +1,6 @@
|
||||
#include "pentablake-gate.h"
|
||||
|
||||
#if defined (__AVX2__)
|
||||
#if defined(PENTABLAKE_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
@@ -4,9 +4,10 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define PENTABLAKE_4WAY
|
||||
#endif
|
||||
// 4way is broken
|
||||
//#if defined(__AVX2__)
|
||||
// #define PENTABLAKE_4WAY
|
||||
//#endif
|
||||
|
||||
#if defined(PENTABLAKE_4WAY)
|
||||
void pentablakehash_4way( void *state, const void *input );
|
||||
|
@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
|
||||
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
blake2s_state S[1];
|
||||
blake2s_state S;
|
||||
|
||||
/* Verify parameters */
|
||||
if ( NULL == in ) return -1;
|
||||
@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
|
||||
|
||||
if( keylen > 0 )
|
||||
{
|
||||
if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
|
||||
if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( blake2s_init( S, outlen ) < 0 ) return -1;
|
||||
if( blake2s_init( &S, outlen ) < 0 ) return -1;
|
||||
}
|
||||
|
||||
blake2s_update( S, ( uint8_t * )in, inlen );
|
||||
blake2s_final( S, out, outlen );
|
||||
blake2s_update( &S, ( uint8_t * )in, inlen );
|
||||
blake2s_final( &S, out, outlen );
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -116,7 +116,7 @@ extern "C" {
|
||||
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
|
||||
} blake2s_param;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_state
|
||||
{
|
||||
uint32_t h[8];
|
||||
uint32_t t[2];
|
||||
|
@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
|
||||
H7 ^= S3 ^ V7 ^ VF; \
|
||||
} while (0)
|
||||
|
||||
#define COMPRESS32_LE do { \
|
||||
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = S0 ^ CS0; \
|
||||
V9 = S1 ^ CS1; \
|
||||
VA = S2 ^ CS2; \
|
||||
VB = S3 ^ CS3; \
|
||||
VC = T0 ^ CS4; \
|
||||
VD = T0 ^ CS5; \
|
||||
VE = T1 ^ CS6; \
|
||||
VF = T1 ^ CS7; \
|
||||
M0 = *((uint32_t*)(buf + 0)); \
|
||||
M1 = *((uint32_t*)(buf + 4)); \
|
||||
M2 = *((uint32_t*)(buf + 8)); \
|
||||
M3 = *((uint32_t*)(buf + 12)); \
|
||||
M4 = *((uint32_t*)(buf + 16)); \
|
||||
M5 = *((uint32_t*)(buf + 20)); \
|
||||
M6 = *((uint32_t*)(buf + 24)); \
|
||||
M7 = *((uint32_t*)(buf + 28)); \
|
||||
M8 = *((uint32_t*)(buf + 32)); \
|
||||
M9 = *((uint32_t*)(buf + 36)); \
|
||||
MA = *((uint32_t*)(buf + 40)); \
|
||||
MB = *((uint32_t*)(buf + 44)); \
|
||||
MC = *((uint32_t*)(buf + 48)); \
|
||||
MD = *((uint32_t*)(buf + 52)); \
|
||||
ME = *((uint32_t*)(buf + 56)); \
|
||||
MF = *((uint32_t*)(buf + 60)); \
|
||||
ROUND_S(0); \
|
||||
ROUND_S(1); \
|
||||
ROUND_S(2); \
|
||||
ROUND_S(3); \
|
||||
ROUND_S(4); \
|
||||
ROUND_S(5); \
|
||||
ROUND_S(6); \
|
||||
ROUND_S(7); \
|
||||
if (BLAKE32_ROUNDS == 14) { \
|
||||
ROUND_S(8); \
|
||||
ROUND_S(9); \
|
||||
ROUND_S(0); \
|
||||
ROUND_S(1); \
|
||||
ROUND_S(2); \
|
||||
ROUND_S(3); \
|
||||
} \
|
||||
H0 ^= S0 ^ V0 ^ V8; \
|
||||
H1 ^= S1 ^ V1 ^ V9; \
|
||||
H2 ^= S2 ^ V2 ^ VA; \
|
||||
H3 ^= S3 ^ V3 ^ VB; \
|
||||
H4 ^= S0 ^ V4 ^ VC; \
|
||||
H5 ^= S1 ^ V5 ^ VD; \
|
||||
H6 ^= S2 ^ V6 ^ VE; \
|
||||
H7 ^= S3 ^ V7 ^ VF; \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#if SPH_64
|
||||
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE32
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if (len < (sizeof sc->buf) - ptr) {
|
||||
memcpy(buf + ptr, data, len);
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32(sc);
|
||||
while (len > 0) {
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(buf + ptr, data, clen);
|
||||
ptr += clen;
|
||||
data = (const unsigned char *)data + clen;
|
||||
len -= clen;
|
||||
if (ptr == sizeof sc->buf) {
|
||||
if ((T0 = SPH_T32(T0 + 512)) < 512)
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
COMPRESS32_LE;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_close(sph_blake_small_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
|
||||
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
|
||||
blake32(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
sph_blake256_update_le(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_le(cc, data, len);
|
||||
}
|
||||
|
||||
/* see sph_blake.h */
|
||||
void
|
||||
sph_blake256_close(void *cc, void *dst)
|
||||
|
@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_blake256(void *cc, const void *data, size_t len);
|
||||
void sph_blake256_update_le(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current BLAKE-256 computation and output the result into
|
||||
|
@@ -30,18 +30,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "sph_blake2b.h"
|
||||
|
||||
// Cyclic right rotation.
|
||||
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
#endif
|
||||
|
||||
// Little-endian byte access.
|
||||
|
||||
#define B2B_GET64(p) \
|
||||
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
|
||||
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
|
||||
@@ -52,32 +45,122 @@
|
||||
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
|
||||
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
|
||||
|
||||
// G Mixing function.
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define B2B_G(a, b, c, d, x, y) { \
|
||||
v[a] = v[a] + v[b] + x; \
|
||||
v[d] = ROTR64(v[d] ^ v[a], 32); \
|
||||
v[c] = v[c] + v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c], 24); \
|
||||
v[a] = v[a] + v[b] + y; \
|
||||
v[d] = ROTR64(v[d] ^ v[a], 16); \
|
||||
v[c] = v[c] + v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c], 63); }
|
||||
#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
|
||||
{ \
|
||||
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
|
||||
_mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
|
||||
m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
|
||||
V[2] = _mm256_add_epi64( V[2], V[3] ); \
|
||||
V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
|
||||
\
|
||||
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
|
||||
_mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
|
||||
m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
|
||||
V[2] = _mm256_add_epi64( V[2], V[3] ); \
|
||||
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m256i *V = (__m256i*)v; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
|
||||
V[3] = mm256_shufll_64( V[3] ); \
|
||||
V[2] = mm256_swap_128( V[2] ); \
|
||||
V[1] = mm256_shuflr_64( V[1] ); \
|
||||
BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
|
||||
V[3] = mm256_shuflr_64( V[3] ); \
|
||||
V[2] = mm256_swap_128( V[2] ); \
|
||||
V[1] = mm256_shufll_64( V[1] ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m128i *V = (__m128i*)v; \
|
||||
__m128i V2, V3, V6, V7; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
#endif
|
||||
|
||||
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
|
||||
{ \
|
||||
Va = Va + Vb + m[ sigma[R][Sa] ]; \
|
||||
Vd = ROTR64( Vd ^ Va, 32 ); \
|
||||
Vc = Vc + Vd; \
|
||||
Vb = ROTR64( Vb ^ Vc, 24 ); \
|
||||
\
|
||||
Va = Va + Vb + m[ sigma[R][Sb] ]; \
|
||||
Vd = ROTR64( Vd ^ Va, 16 ); \
|
||||
Vc = Vc + Vd; \
|
||||
Vb = ROTR64( Vb ^ Vc, 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
|
||||
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
|
||||
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
|
||||
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
|
||||
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
|
||||
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
|
||||
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
|
||||
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Initialization Vector.
|
||||
|
||||
static const uint64_t blake2b_iv[8] = {
|
||||
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
// Compression function. "last" flag indicates last block.
|
||||
|
||||
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
const uint8_t sigma[12][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
@@ -91,8 +174,14 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
// Compression function. "last" flag indicates last block.
|
||||
|
||||
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
{
|
||||
uint64_t v[16] __attribute__ ((aligned (32)));
|
||||
uint64_t m[16] __attribute__ ((aligned (32)));
|
||||
int i;
|
||||
uint64_t v[16], m[16];
|
||||
|
||||
for (i = 0; i < 8; i++) { // init work variables
|
||||
v[i] = ctx->h[i];
|
||||
@@ -106,16 +195,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
for (i = 0; i < 16; i++) // get little-endian words
|
||||
m[i] = B2B_GET64(&ctx->b[8 * i]);
|
||||
|
||||
for (i = 0; i < 12; i++) { // twelve rounds
|
||||
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
|
||||
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
|
||||
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
|
||||
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
|
||||
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
|
||||
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
|
||||
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
|
||||
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
|
||||
}
|
||||
for (i = 0; i < 12; i++)
|
||||
BLAKE2B_ROUND( i );
|
||||
|
||||
for( i = 0; i < 8; ++i )
|
||||
ctx->h[i] ^= v[i] ^ v[i + 8];
|
||||
|
@@ -18,7 +18,7 @@
|
||||
#endif
|
||||
|
||||
// state context
|
||||
ALIGN(64) typedef struct {
|
||||
typedef ALIGN(64) struct {
|
||||
uint8_t b[128]; // input buffer
|
||||
uint64_t h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
|
@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
qt[30] = expand2s8( qt, M, H, 30 );
|
||||
qt[31] = expand2s8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm256_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm256_xor_si256( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
|
||||
mm256_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm256_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
/*
|
||||
dH[ 0] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
|
||||
_mm256_srli_epi32( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
|
||||
_mm256_slli_epi32( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
|
||||
_mm256_slli_epi32( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
|
||||
_mm256_slli_epi32( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
|
||||
_mm256_slli_epi32( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
|
||||
_mm256_srli_epi32( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
|
||||
_mm256_slli_epi32( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
|
||||
_mm256_slli_epi32( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
*/
|
||||
}
|
||||
|
||||
static const __m256i final_s8[16] =
|
||||
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
|
||||
qt[30] = expand2s16( qt, M, H, 30 );
|
||||
qt[31] = expand2s16( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
|
@@ -594,22 +594,12 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
#define rb6(x) mm256_rol_64( x, 43 )
|
||||
#define rb7(x) mm256_rol_64( x, 53 )
|
||||
|
||||
#define rol_off_64( M, j, off ) \
|
||||
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define add_elt_b( M, H, j ) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
|
||||
rol_off_64( M, j, 3 ) ), \
|
||||
rol_off_64( M, j, 10 ) ), \
|
||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
#define expand1b( qt, M, H, i ) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand1_b( qt, i ) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
|
||||
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
|
||||
@@ -617,11 +607,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
|
||||
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b( qt, M, H, i) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand2_b( qt, i) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
|
||||
@@ -629,159 +618,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm256_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm256_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm256_sub_epi64( mh[10], \
|
||||
mh[13] ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm256_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm256_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm256_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm256_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm256_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
__m256i qt[32], xl, xh;
|
||||
__m256i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm256_xor_si256( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
|
||||
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
|
||||
@@ -799,22 +727,77 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
|
||||
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
|
||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||
qt[16] = expand1b( qt, M, H, 16 );
|
||||
qt[17] = expand1b( qt, M, H, 17 );
|
||||
qt[18] = expand2b( qt, M, H, 18 );
|
||||
qt[19] = expand2b( qt, M, H, 19 );
|
||||
qt[20] = expand2b( qt, M, H, 20 );
|
||||
qt[21] = expand2b( qt, M, H, 21 );
|
||||
qt[22] = expand2b( qt, M, H, 22 );
|
||||
qt[23] = expand2b( qt, M, H, 23 );
|
||||
qt[24] = expand2b( qt, M, H, 24 );
|
||||
qt[25] = expand2b( qt, M, H, 25 );
|
||||
qt[26] = expand2b( qt, M, H, 26 );
|
||||
qt[27] = expand2b( qt, M, H, 27 );
|
||||
qt[28] = expand2b( qt, M, H, 28 );
|
||||
qt[29] = expand2b( qt, M, H, 29 );
|
||||
qt[30] = expand2b( qt, M, H, 30 );
|
||||
qt[31] = expand2b( qt, M, H, 31 );
|
||||
|
||||
__m256i mj[16];
|
||||
|
||||
mj[ 0] = mm256_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm256_rol_64( M[ 1], 2 );
|
||||
mj[ 2] = mm256_rol_64( M[ 2], 3 );
|
||||
mj[ 3] = mm256_rol_64( M[ 3], 4 );
|
||||
mj[ 4] = mm256_rol_64( M[ 4], 5 );
|
||||
mj[ 5] = mm256_rol_64( M[ 5], 6 );
|
||||
mj[ 6] = mm256_rol_64( M[ 6], 7 );
|
||||
mj[ 7] = mm256_rol_64( M[ 7], 8 );
|
||||
mj[ 8] = mm256_rol_64( M[ 8], 9 );
|
||||
mj[ 9] = mm256_rol_64( M[ 9], 10 );
|
||||
mj[10] = mm256_rol_64( M[10], 11 );
|
||||
mj[11] = mm256_rol_64( M[11], 12 );
|
||||
mj[12] = mm256_rol_64( M[12], 13 );
|
||||
mj[13] = mm256_rol_64( M[13], 14 );
|
||||
mj[14] = mm256_rol_64( M[14], 15 );
|
||||
mj[15] = mm256_rol_64( M[15], 16 );
|
||||
|
||||
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
|
||||
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
|
||||
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
|
||||
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
|
||||
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
|
||||
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
|
||||
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
|
||||
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
|
||||
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
|
||||
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
|
||||
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
|
||||
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
|
||||
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
|
||||
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
|
||||
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
@@ -823,7 +806,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
@@ -1066,21 +1048,12 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||
|
||||
#define rol8w_off_64( M, j, off ) \
|
||||
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define add_elt_b8( M, H, j ) \
|
||||
_mm512_xor_si512( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
|
||||
rol8w_off_64( M, j, 3 ) ), \
|
||||
rol8w_off_64( M, j, 10 ) ), \
|
||||
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
#define expand1b8( qt, M, H, i ) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand1_b8( qt, i ) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
||||
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
||||
@@ -1088,11 +1061,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
||||
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b8( qt, M, H, i) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand2_b8( qt, i) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
||||
@@ -1100,157 +1072,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define W8b0 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm512_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define W8b1 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm512_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define W8b2 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b3 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm512_sub_epi64( mh[10], mh[13] ) )
|
||||
|
||||
#define W8b4 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm512_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define W8b5 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b6 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm512_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define W8b7 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm512_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define W8b8 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define W8b9 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define W8b10 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define W8b11 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define W8b12 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define W8b13 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm512_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define W8b14 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm512_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define W8b15 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
__m512i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm512_xor_si512( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
||||
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
||||
@@ -1268,57 +1180,107 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
qt[16] = expand1b8( qt, M, H, 16 );
|
||||
qt[17] = expand1b8( qt, M, H, 17 );
|
||||
qt[18] = expand2b8( qt, M, H, 18 );
|
||||
qt[19] = expand2b8( qt, M, H, 19 );
|
||||
qt[20] = expand2b8( qt, M, H, 20 );
|
||||
qt[21] = expand2b8( qt, M, H, 21 );
|
||||
qt[22] = expand2b8( qt, M, H, 22 );
|
||||
qt[23] = expand2b8( qt, M, H, 23 );
|
||||
qt[24] = expand2b8( qt, M, H, 24 );
|
||||
qt[25] = expand2b8( qt, M, H, 25 );
|
||||
qt[26] = expand2b8( qt, M, H, 26 );
|
||||
qt[27] = expand2b8( qt, M, H, 27 );
|
||||
qt[28] = expand2b8( qt, M, H, 28 );
|
||||
qt[29] = expand2b8( qt, M, H, 29 );
|
||||
qt[30] = expand2b8( qt, M, H, 30 );
|
||||
qt[31] = expand2b8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
__m512i mj[16];
|
||||
|
||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
||||
mj[ 3] = mm512_rol_64( M[ 3], 4 );
|
||||
mj[ 4] = mm512_rol_64( M[ 4], 5 );
|
||||
mj[ 5] = mm512_rol_64( M[ 5], 6 );
|
||||
mj[ 6] = mm512_rol_64( M[ 6], 7 );
|
||||
mj[ 7] = mm512_rol_64( M[ 7], 8 );
|
||||
mj[ 8] = mm512_rol_64( M[ 8], 9 );
|
||||
mj[ 9] = mm512_rol_64( M[ 9], 10 );
|
||||
mj[10] = mm512_rol_64( M[10], 11 );
|
||||
mj[11] = mm512_rol_64( M[11], 12 );
|
||||
mj[12] = mm512_rol_64( M[12], 13 );
|
||||
mj[13] = mm512_rol_64( M[13], 14 );
|
||||
mj[14] = mm512_rol_64( M[14], 15 );
|
||||
mj[15] = mm512_rol_64( M[15], 16 );
|
||||
|
||||
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
|
||||
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
|
||||
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
|
||||
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
|
||||
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
|
||||
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
|
||||
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
|
||||
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
|
||||
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
|
||||
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
|
||||
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
|
||||
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
|
||||
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
|
||||
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
|
||||
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
|
||||
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
|
@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
y0 = mm512_rol_32( x2, 7 );
|
||||
y1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( x0, 7 );
|
||||
x3 = mm512_rol_32( x1, 7 );
|
||||
x0 = _mm512_xor_si512( y0, x4 );
|
||||
x1 = _mm512_xor_si512( y1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap128_64( x4 );
|
||||
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( y0, 11 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( y1, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
y0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( x0, 11 );
|
||||
y1 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( x2, 11 );
|
||||
x0 = _mm512_xor_si512( y0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x2 = _mm512_xor_si512( y1, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap64_32( x4 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
@@ -98,6 +94,122 @@ static void transform_4way( cube_4way_context *sp )
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
// 8 ways, 4 way parallel double buffered
|
||||
static void transform_4way_2buf( cube_4way_2buf_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
__m512i tx0, tx1, ty0, ty1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h0 );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
|
||||
|
||||
y0 = _mm512_load_si512( (__m512i*)sp->h1 );
|
||||
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
|
||||
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
|
||||
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
|
||||
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
|
||||
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
|
||||
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
|
||||
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
|
||||
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
tx0 = mm512_rol_32( x2, 7 );
|
||||
ty0 = mm512_rol_32( y2, 7 );
|
||||
tx1 = mm512_rol_32( x3, 7 );
|
||||
ty1 = mm512_rol_32( y3, 7 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
x2 = mm512_rol_32( x0, 7 );
|
||||
y2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( x1, 7 );
|
||||
y3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( tx0, x4 );
|
||||
y0 = _mm512_xor_si512( ty0, y4 );
|
||||
x1 = _mm512_xor_si512( tx1, x5 );
|
||||
y1 = _mm512_xor_si512( ty1, y5 );
|
||||
x4 = mm512_swap128_64( x4 );
|
||||
y4 = mm512_swap128_64( y4 );
|
||||
x5 = mm512_swap128_64( x5 );
|
||||
y5 = mm512_swap128_64( y5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
y2 = _mm512_xor_si512( y2, y6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
x6 = mm512_swap128_64( x6 );
|
||||
y6 = mm512_swap128_64( y6 );
|
||||
x7 = mm512_swap128_64( x7 );
|
||||
y7 = mm512_swap128_64( y7 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
tx0 = mm512_rol_32( x1, 11 );
|
||||
ty0 = mm512_rol_32( y1, 11 );
|
||||
tx1 = mm512_rol_32( x3, 11 );
|
||||
ty1 = mm512_rol_32( y3, 11 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
x1 = mm512_rol_32( x0, 11 );
|
||||
y1 = mm512_rol_32( y0, 11 );
|
||||
x3 = mm512_rol_32( x2, 11 );
|
||||
y3 = mm512_rol_32( y2, 11 );
|
||||
x0 = _mm512_xor_si512( tx0, x4 );
|
||||
y0 = _mm512_xor_si512( ty0, y4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
y1 = _mm512_xor_si512( y1, y5 );
|
||||
x4 = mm512_swap64_32( x4 );
|
||||
y4 = mm512_swap64_32( y4 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
y5 = mm512_swap64_32( y5 );
|
||||
x2 = _mm512_xor_si512( tx1, x6 );
|
||||
y2 = _mm512_xor_si512( ty1, y6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
x6 = mm512_swap64_32( x6 );
|
||||
y6 = mm512_swap64_32( y6 );
|
||||
x7 = mm512_swap64_32( x7 );
|
||||
y7 = mm512_swap64_32( y7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h0, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h1, y0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
@@ -109,22 +221,14 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = mm512_bcast_m128( iv[0] );
|
||||
h[ 1] = mm512_bcast_m128( iv[1] );
|
||||
h[ 2] = mm512_bcast_m128( iv[2] );
|
||||
h[ 3] = mm512_bcast_m128( iv[3] );
|
||||
h[ 4] = mm512_bcast_m128( iv[4] );
|
||||
h[ 5] = mm512_bcast_m128( iv[5] );
|
||||
h[ 6] = mm512_bcast_m128( iv[6] );
|
||||
h[ 7] = mm512_bcast_m128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -155,11 +259,11 @@ int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm512_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm512_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
@@ -179,14 +283,14 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = mm512_bcast_m128( iv[0] );
|
||||
h[ 1] = mm512_bcast_m128( iv[1] );
|
||||
h[ 2] = mm512_bcast_m128( iv[2] );
|
||||
h[ 3] = mm512_bcast_m128( iv[3] );
|
||||
h[ 4] = mm512_bcast_m128( iv[4] );
|
||||
h[ 5] = mm512_bcast_m128( iv[5] );
|
||||
h[ 6] = mm512_bcast_m128( iv[6] );
|
||||
h[ 7] = mm512_bcast_m128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
@@ -206,11 +310,11 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm512_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm512_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
@@ -219,6 +323,66 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||
void *output0, void *output1, int hashbitlen,
|
||||
const void *data0, const void *data1, size_t size )
|
||||
{
|
||||
__m512i *h0 = (__m512i*)sp->h0;
|
||||
__m512i *h1 = (__m512i*)sp->h1;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h1[0] = h0[0] = mm512_bcast_m128( iv[0] );
|
||||
h1[1] = h0[1] = mm512_bcast_m128( iv[1] );
|
||||
h1[2] = h0[2] = mm512_bcast_m128( iv[2] );
|
||||
h1[3] = h0[3] = mm512_bcast_m128( iv[3] );
|
||||
h1[4] = h0[4] = mm512_bcast_m128( iv[4] );
|
||||
h1[5] = h0[5] = mm512_bcast_m128( iv[5] );
|
||||
h1[6] = h0[6] = mm512_bcast_m128( iv[6] );
|
||||
h1[7] = h0[7] = mm512_bcast_m128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m512i *in0 = (__m512i*)data0;
|
||||
const __m512i *in1 = (__m512i*)data1;
|
||||
__m512i *hash0 = (__m512i*)output0;
|
||||
__m512i *hash1 = (__m512i*)output1;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
|
||||
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way_2buf( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
__m512i tmp = mm512_bcast128lo_64( 0x0000000000000080 );
|
||||
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
|
||||
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
|
||||
|
||||
transform_4way_2buf( sp );
|
||||
|
||||
tmp = mm512_bcast128hi_64( 0x0000000100000000 );
|
||||
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
|
||||
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way_2buf( sp );
|
||||
|
||||
memcpy( hash0, sp->h0, sp->hashlen<<6);
|
||||
memcpy( hash1, sp->h1, sp->hashlen<<6);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
@@ -241,11 +405,11 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm512_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm512_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
@@ -259,6 +423,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
// 2 way 128
|
||||
|
||||
// This isn't expected to be used with AVX512 so HW rotate intruction
|
||||
// is assumed not avaiable.
|
||||
// Use double buffering to optimize serial bit rotations. Full double
|
||||
// buffering isn't practical because it needs twice as many registers
|
||||
// with AVX2 having only half as many as AVX512.
|
||||
#define ROL2( out0, out1, in0, in1, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_slli_epi32( in0, c ); \
|
||||
__m256i t1 = _mm256_slli_epi32( in1, c ); \
|
||||
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
|
||||
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
|
||||
out0 = _mm256_or_si256( out0, t0 ); \
|
||||
out1 = _mm256_or_si256( out1, t1 ); \
|
||||
}
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -281,14 +460,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
ROL2( y0, y1, x2, x3, 7 );
|
||||
ROL2( x2, x3, x0, x1, 7 );
|
||||
x0 = _mm256_xor_si256( y0, x4 );
|
||||
x1 = _mm256_xor_si256( y1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap128_64( x4 );
|
||||
@@ -299,15 +474,11 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
ROL2( y0, x1, x1, x0, 11 );
|
||||
ROL2( y1, x3, x3, x2, 11 );
|
||||
x0 = _mm256_xor_si256( y0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x2 = _mm256_xor_si256( y1, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_32( x4 );
|
||||
x5 = mm256_swap64_32( x5 );
|
||||
@@ -336,27 +507,18 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = mm256_bcast_m128( iv[0] );
|
||||
h[ 1] = mm256_bcast_m128( iv[1] );
|
||||
h[ 2] = mm256_bcast_m128( iv[2] );
|
||||
h[ 3] = mm256_bcast_m128( iv[3] );
|
||||
h[ 4] = mm256_bcast_m128( iv[4] );
|
||||
h[ 5] = mm256_bcast_m128( iv[5] );
|
||||
h[ 6] = mm256_bcast_m128( iv[6] );
|
||||
h[ 7] = mm256_bcast_m128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
@@ -383,13 +545,14 @@ int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm256_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm256_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
@@ -416,13 +579,14 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm256_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm256_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
@@ -439,14 +603,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = mm256_bcast_m128( iv[0] );
|
||||
h[ 1] = mm256_bcast_m128( iv[1] );
|
||||
h[ 2] = mm256_bcast_m128( iv[2] );
|
||||
h[ 3] = mm256_bcast_m128( iv[3] );
|
||||
h[ 4] = mm256_bcast_m128( iv[4] );
|
||||
h[ 5] = mm256_bcast_m128( iv[5] );
|
||||
h[ 6] = mm256_bcast_m128( iv[6] );
|
||||
h[ 7] = mm256_bcast_m128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
@@ -466,13 +630,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
mm256_bcast128lo_64( 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
mm256_bcast128hi_64( 0x0000000100000000 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
|
@@ -17,41 +17,41 @@ struct _cube_4way_context
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
struct _cube_4way_2buf_context
|
||||
{
|
||||
__m512i h0[8];
|
||||
__m512i h1[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
|
||||
#define cube512_4way_update cube_4way_update
|
||||
#define cube512_4way_update_close cube_4way_update
|
||||
#define cube512_4way_close cube_4way_update
|
||||
#define cube512_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 512, data, size )
|
||||
#define cube512_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 512, data, size )
|
||||
|
||||
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
|
||||
#define cube256_4way_update cube_4way_update
|
||||
#define cube256_4way_update_close cube_4way_update
|
||||
#define cube256_4way_close cube_4way_update
|
||||
#define cube256_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 256, data, size )
|
||||
#define cube256_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 256, data, size )
|
||||
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||
void *output0, void *output1, int hashbitlen,
|
||||
const void *data0, const void *data1, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
// 2x128, 2 way parallel AVX2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
|
@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
|
||||
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32(
|
||||
mm512_swap256_128( x0 ), 11 ), x1 );
|
||||
x0 = mm512_swap_256( x0 );
|
||||
x0 = mm512_rol_32( x0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap128_64( x1 );
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = mm512_swap256_128( x0 );
|
||||
x0 = mm512_rol_32( x0, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap64_32( x1 );
|
||||
}
|
||||
|
||||
|
@@ -15,11 +15,11 @@
|
||||
|
||||
struct _cubehashParam
|
||||
{
|
||||
__m128i _ALIGN(64) x[8]; // aligned for __m512i
|
||||
int hashlen; // __m128i
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
__m128i _ALIGN(64) x[8]; // aligned for __m256i
|
||||
};
|
||||
|
||||
typedef struct _cubehashParam cubehashParam;
|
||||
|
@@ -53,10 +53,24 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
|
||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
|
||||
k1 = _mm_add_epi32(k1, M128(const1))
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero))
|
||||
|
||||
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
||||
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
||||
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
||||
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
|
@@ -10,22 +10,27 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
|
||||
};
|
||||
*/
|
||||
// do these need to be reversed?
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
#define mul2mask \
|
||||
m512_const2_64( 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
#define lsbmask m512_const1_32( 0x01010101 )
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
|
||||
|
||||
#define ECHO_SUBBYTES( state, i, j ) \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
|
||||
k1 = _mm512_add_epi32( k1, m512_one_128 );
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero );
|
||||
|
||||
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
@@ -46,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
@@ -57,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
@@ -68,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -114,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
@@ -140,6 +162,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
|
||||
unsigned int r, b, i, j;
|
||||
__m512i t1, t2, s2, k1;
|
||||
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
const __m512i one = mm512_bcast128lo_64( 1 );
|
||||
const __m512i mul2mask = mm512_bcast128lo_64( 0x00001b00 );
|
||||
const __m512i lsbmask = _mm512_set1_epi32( 0x01010101 );
|
||||
|
||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||
@@ -239,16 +264,16 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize )
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
ctx->hashsize = mm512_bcast128lo_64( 0x100 );
|
||||
ctx->const1536 = mm512_bcast128lo_64( 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400);
|
||||
ctx->hashsize = mm512_bcast128lo_64( 0x200 );
|
||||
ctx->const1536 = mm512_bcast128lo_64( 0x400);
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -280,7 +305,7 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
{
|
||||
echo_4way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
remainingbits = mm512_bcast128lo_64( -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
@@ -288,13 +313,15 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
|
||||
remainingbits = mm512_bcast128lo_64( (uint64_t)databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
state->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
|
||||
state->buffer[ vblen-2 ] =
|
||||
mm512_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
|
||||
state->buffer[ vblen-1 ] =
|
||||
mm512_bcast128lo_64( state->processed_bits );
|
||||
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
@@ -327,16 +354,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
ctx->hashsize = mm512_bcast128lo_64( 0x100 );
|
||||
ctx->const1536 = mm512_bcast128lo_64( 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400 );
|
||||
ctx->hashsize = mm512_bcast128lo_64( 0x200 );
|
||||
ctx->const1536 = mm512_bcast128lo_64( 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -363,7 +390,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
{
|
||||
echo_4way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
remainingbits = mm512_bcast128lo_64( -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
@@ -371,14 +398,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m512_const2_64( 0, databitlen );
|
||||
remainingbits = mm512_bcast128lo_64( databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
ctx->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
|
||||
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] =
|
||||
m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
|
||||
mm512_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
|
||||
ctx->buffer[ vblen-1 ] = mm512_bcast128lo_64( ctx->processed_bits);
|
||||
|
||||
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
|
||||
@@ -400,14 +427,28 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 )
|
||||
#define mul2mask_2way mm256_bcast128lo_64( 0x0000000000001b00 )
|
||||
|
||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||
#define lsbmask_2way _mm256_set1_epi32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES4_2WAY( state, j ) \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
|
||||
|
||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 );
|
||||
|
||||
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
@@ -455,6 +496,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES4_2WAY(_state, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||
@@ -496,6 +556,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE_2WAY(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
@@ -620,16 +681,16 @@ int echo_2way_init( echo_2way_context *ctx, int nHashSize )
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x100 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x200 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -661,20 +722,20 @@ int echo_2way_update_close( echo_2way_context *state, void *hashval,
|
||||
{
|
||||
echo_2way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
remainingbits = mm256_bcast128lo_64( -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_256( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
remainingbits = mm256_bcast128lo_64( databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
state->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
|
||||
memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
|
||||
state->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
|
||||
state->buffer[ vblen-1 ] = mm256_bcast128lo_64( state->processed_bits );
|
||||
|
||||
state->k = _mm256_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm256_sub_epi64( state->k, state->const1536 );
|
||||
@@ -707,16 +768,16 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x100 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x200 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -739,7 +800,7 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
{
|
||||
echo_2way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
remainingbits = mm256_bcast128lo_64( -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
@@ -747,13 +808,13 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_256( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
remainingbits = mm256_bcast128lo_64( databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
|
||||
memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
|
||||
ctx->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
|
||||
ctx->buffer[ vblen-1 ] = mm256_bcast128lo_64( ctx->processed_bits );
|
||||
|
||||
ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s7 = _mm_xor_si128(s7, t1)
|
||||
|
||||
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
|
||||
/*
|
||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||
s1 = x;\
|
||||
s2 = _mm_add_epi8(x, x);\
|
||||
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
*/
|
||||
|
||||
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
|
||||
#define SUBSTITUTE(r0, _t2 )\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
|
||||
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
|
||||
/*
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
||||
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
||||
|
||||
*/
|
||||
|
||||
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t4 = t1;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t4;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = _mm_xor_si128(t2, t3);\
|
||||
t2 = _mm_xor_si128(t2, t0);\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = t0;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
||||
t4 = _mm_xor_si128(t4, t0)
|
||||
|
||||
|
||||
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
UNPACK_S0(r3c, r3a, _t3)
|
||||
|
||||
|
||||
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
||||
r4c = _mm_xor_si128(r4c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r4d = _mm_xor_si128(r4d, _t0);\
|
||||
UNPACK_S0(r3c, r3a, _t3);\
|
||||
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r4c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
||||
UNPACK_S0(r4c, r4a, _t3)
|
||||
|
||||
|
||||
|
||||
#define LOADCOLUMN(x, s, a)\
|
||||
block[0] = col[(base + a + 0) % s];\
|
||||
block[1] = col[(base + a + 1) % s];\
|
||||
@@ -278,18 +305,15 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
while( uBlockCount > 0 )
|
||||
{
|
||||
TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
|
||||
ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
|
||||
ctx->state[5], ctx->state[11], ctx->state[0],
|
||||
ctx->state[10], ctx->state[4], ctx->state[10],
|
||||
ctx->state[11], ctx->state[9], ctx->state[3],
|
||||
ctx->state[9], ctx->state[10], ctx->state[8],
|
||||
ctx->state[2] );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
|
||||
ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
|
||||
ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
|
||||
ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
@@ -300,9 +324,10 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
|
||||
SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
|
||||
ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
|
||||
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
|
||||
ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]);
|
||||
ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
|
@@ -16,6 +16,10 @@
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
#if !defined(__SSE4_1__)
|
||||
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
|
||||
#endif
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
@@ -33,12 +37,23 @@ typedef struct
|
||||
|
||||
} hashState_fugue __attribute__ ((aligned (64)));
|
||||
|
||||
|
||||
// These functions are deprecated, use the lower case macro aliases that use
|
||||
// the standard interface. This will be cleaned up at a later date.
|
||||
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
|
||||
|
||||
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
|
||||
|
||||
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
|
||||
|
||||
#define fugue512_init( state ) \
|
||||
fugue512_Init( state, 512 )
|
||||
#define fugue512_update( state, data, len ) \
|
||||
fugue512_Update( state, data, (len)<<3 )
|
||||
#define fugue512_final \
|
||||
fugue512_Final
|
||||
|
||||
|
||||
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
|
||||
|
||||
#endif // AES
|
||||
|
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
|
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
|
@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -156,14 +150,12 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
@@ -175,8 +167,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
// use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
|
@@ -43,7 +43,8 @@
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
|
||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||
|
||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
// INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -227,12 +219,10 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
@@ -255,8 +245,8 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
{
|
||||
// Copy any remaining data to buffer for final transform
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
// use i as rem_ptr in final
|
||||
}
|
||||
|
||||
//--- final ---
|
||||
|
@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
|
||||
//#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
@@ -36,8 +33,7 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -50,14 +46,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
uint64_t blocks = len / SIZE256;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
@@ -65,9 +57,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -76,11 +67,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
// copy any remaining data to buffer
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
// use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
@@ -89,18 +79,18 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -122,7 +112,7 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
uint64_t blocks = len / SIZE256;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
@@ -146,20 +136,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -184,8 +172,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
// return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
@@ -194,7 +182,7 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
@@ -208,14 +196,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
uint64_t blocks = len / SIZE256;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m256_zero;
|
||||
@@ -223,9 +207,8 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -234,11 +217,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
// copy any remaining data to buffer
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
// use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
@@ -247,18 +229,18 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -279,7 +261,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
uint64_t blocks = len / SIZE256;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
@@ -303,25 +285,22 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_2way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_2way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
|
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -120,7 +118,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
@@ -128,7 +128,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
@@ -140,34 +140,23 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0; \
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
@@ -176,7 +165,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
@@ -216,18 +205,18 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
#define MASK_NOT( a ) _mm512_mask_ternarylogic_epi64( a, 0xaa, a, a, 1 )
|
||||
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
|
||||
a1 = _mm512_xor_si512( a1, b1 );\
|
||||
a2 = _mm512_xor_si512( a2, b1 );\
|
||||
a3 = _mm512_xor_si512( a3, b1 );\
|
||||
a4 = _mm512_xor_si512( a4, b1 );\
|
||||
a5 = _mm512_xor_si512( a5, b1 );\
|
||||
a6 = _mm512_xor_si512( a6, b1 );\
|
||||
a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
|
||||
a0 = _mm512_xor_si512( a0, mm512_bcast_m128( round_const_l0[i] ) );\
|
||||
a1 = MASK_NOT( a1 ); \
|
||||
a2 = MASK_NOT( a2 ); \
|
||||
a3 = MASK_NOT( a3 ); \
|
||||
a4 = MASK_NOT( a4 ); \
|
||||
a5 = MASK_NOT( a5 ); \
|
||||
a6 = MASK_NOT( a6 ); \
|
||||
a7 = _mm512_xor_si512( a7, mm512_bcast_m128( round_const_l7[i] ) );\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm512_xor_si512( b0, b0 );\
|
||||
@@ -363,7 +352,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm512_xor_si512( t0, t0 );\
|
||||
t0 = m512_zero;\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
@@ -394,11 +383,11 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
|
||||
void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
@@ -460,11 +449,11 @@ void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
|
||||
void OF512_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -609,7 +598,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
@@ -651,15 +640,15 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\
|
||||
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
|
||||
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
|
||||
a1 = _mm256_xor_si256( a1, b1 );\
|
||||
a2 = _mm256_xor_si256( a2, b1 );\
|
||||
a3 = _mm256_xor_si256( a3, b1 );\
|
||||
a4 = _mm256_xor_si256( a4, b1 );\
|
||||
a5 = _mm256_xor_si256( a5, b1 );\
|
||||
a6 = _mm256_xor_si256( a6, b1 );\
|
||||
a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\
|
||||
a7 = _mm256_xor_si256( a7, mm256_bcast_m128( round_const_l7[i] ) );\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm256_xor_si256( b0, b0 );\
|
||||
@@ -763,7 +752,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm256_xor_si256( t0, t0 );\
|
||||
t0 = m256_zero;\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
@@ -787,11 +776,11 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
void TF512_2way( __m256i* chaining, __m256i* message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
@@ -853,11 +842,11 @@ void TF512_2way( __m256i* chaining, __m256i* message )
|
||||
|
||||
void OF512_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
|
@@ -21,15 +21,11 @@
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -43,7 +39,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
@@ -64,16 +60,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
@@ -99,9 +93,8 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -110,8 +103,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += ctx->rem_ptr;
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
|
||||
// --- close ---
|
||||
|
||||
@@ -120,14 +112,14 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
@@ -146,14 +138,11 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
|
||||
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
@@ -168,7 +157,7 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
@@ -189,16 +178,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
@@ -224,9 +211,8 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -235,8 +221,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += ctx->rem_ptr;
|
||||
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
|
||||
|
||||
// --- close ---
|
||||
|
||||
@@ -245,14 +230,14 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
|
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -130,7 +128,8 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
@@ -138,7 +137,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
@@ -150,34 +149,23 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0; \
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
@@ -186,7 +174,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
@@ -250,7 +238,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
|
||||
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
@@ -265,7 +253,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
|
||||
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
|
||||
@@ -294,7 +282,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
|
||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||
xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
|
||||
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
|
||||
@@ -317,7 +305,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||
xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
|
||||
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
|
||||
@@ -483,8 +471,8 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
|
||||
void INIT_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -512,12 +500,12 @@ void INIT_4way( __m512i* chaining )
|
||||
|
||||
void TF1024_4way( __m512i* chaining, const __m512i* message )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i QTEMP[8];
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i QTEMP[8];
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
@@ -618,11 +606,11 @@ void TF1024_4way( __m512i* chaining, const __m512i* message )
|
||||
|
||||
void OF1024_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2_2WAY(i, j, k){\
|
||||
j = _mm256_xor_si256(j, j);\
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
j = _mm256_cmpgt_epi8( m256_zero, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
i = mm256_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
@@ -772,7 +758,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
@@ -836,7 +822,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \
|
||||
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
|
||||
@@ -851,7 +837,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \
|
||||
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
|
||||
@@ -880,7 +866,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
xmm12 = _mm256_xor_si256( xmm12, xmm1 );\
|
||||
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
|
||||
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
|
||||
xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \
|
||||
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
|
||||
@@ -903,7 +889,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
xmm4 = _mm256_xor_si256( xmm4, xmm9 );\
|
||||
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
|
||||
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
|
||||
xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \
|
||||
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
|
||||
@@ -1054,8 +1040,8 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
void INIT_2way( __m256i *chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -1083,12 +1069,12 @@ void INIT_2way( __m256i *chaining )
|
||||
|
||||
void TF1024_2way( __m256i *chaining, const __m256i *message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i QTEMP[8];
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i QTEMP[8];
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
@@ -1189,11 +1175,11 @@ void TF1024_2way( __m256i *chaining, const __m256i *message )
|
||||
|
||||
void OF1024_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
|
@@ -11,7 +11,7 @@
|
||||
#else
|
||||
#include "sph_groestl.h"
|
||||
#endif
|
||||
#include <openssl/sha.h>
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
typedef struct {
|
||||
#ifdef __AES__
|
||||
@@ -19,7 +19,6 @@ typedef struct {
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
SHA256_CTX sha;
|
||||
} myrgr_ctx_holder;
|
||||
|
||||
myrgr_ctx_holder myrgr_ctx;
|
||||
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
|
||||
#else
|
||||
sph_groestl512_init( &myrgr_ctx.groestl );
|
||||
#endif
|
||||
SHA256_Init( &myrgr_ctx.sha );
|
||||
}
|
||||
|
||||
void myriad_hash(void *output, const void *input)
|
||||
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
|
||||
SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx.sha );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
@@ -63,7 +60,7 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
@@ -76,10 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], nonce);
|
||||
myriad_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
if (hash[7] <= Htarg )
|
||||
if ( fulltest(hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
|
||||
|
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
||||
|
||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
#else
|
||||
|
||||
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
hash4, hash5, hash6, hash7, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, 512 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
#endif
|
||||
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
||||
sha256_8way_close( &ctx.sha, output );
|
||||
}
|
||||
|
@@ -545,39 +545,33 @@ static const sph_u32 T512[64][16] = {
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi 8 way
|
||||
// Hamsi 8 way AVX512
|
||||
|
||||
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
|
||||
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
|
||||
// on i9-9940x cmplt with zero was 3% faster than movepi.
|
||||
|
||||
#define INPUT_BIG8 \
|
||||
do { \
|
||||
__m512i db = *buf; \
|
||||
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
|
||||
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
|
||||
const __m512i zero = m512_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
|
||||
dm = mm512_negate_32( _mm512_or_si512( dm, \
|
||||
_mm512_slli_epi64( dm, 32 ) ) ); \
|
||||
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[1] ) ) ); \
|
||||
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[2] ) ) ); \
|
||||
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[3] ) ) ); \
|
||||
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[4] ) ) ); \
|
||||
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[5] ) ) ); \
|
||||
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[6] ) ) ); \
|
||||
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[7] ) ) ); \
|
||||
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
|
||||
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
|
||||
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
|
||||
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
|
||||
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
|
||||
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
|
||||
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
|
||||
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
|
||||
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
|
||||
db = _mm512_ror_epi64( db, 1 ); \
|
||||
tp += 8; \
|
||||
db = _mm512_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -585,22 +579,14 @@ do { \
|
||||
do { \
|
||||
__m512i t; \
|
||||
t = a; \
|
||||
a = _mm512_and_si512( a, c ); \
|
||||
a = _mm512_xor_si512( a, d ); \
|
||||
c = _mm512_xor_si512( c, b ); \
|
||||
c = _mm512_xor_si512( c, a ); \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, b ); \
|
||||
a = mm512_xorand( d, a, c ); \
|
||||
c = mm512_xor3( a, b, c ); \
|
||||
b = mm512_xoror( b, d, t ); \
|
||||
t = _mm512_xor_si512( t, c ); \
|
||||
b = d; \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, a ); \
|
||||
a = _mm512_and_si512( a, b ); \
|
||||
t = _mm512_xor_si512( t, a ); \
|
||||
b = _mm512_xor_si512( b, d ); \
|
||||
b = _mm512_xor_si512( b, t ); \
|
||||
d = mm512_xoror( a, b, t ); \
|
||||
t = mm512_xorand( t, a, b ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
c = mm512_xor3( b, d, t ); \
|
||||
b = d; \
|
||||
d = mm512_not( t ); \
|
||||
} while (0)
|
||||
@@ -609,14 +595,12 @@ do { \
|
||||
do { \
|
||||
a = mm512_rol_32( a, 13 ); \
|
||||
c = mm512_rol_32( c, 3 ); \
|
||||
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
|
||||
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
|
||||
_mm512_slli_epi32( a, 3 ) ) ); \
|
||||
b = mm512_xor3( a, b, c ); \
|
||||
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
|
||||
b = mm512_rol_32( b, 1 ); \
|
||||
d = mm512_rol_32( d, 7 ); \
|
||||
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
|
||||
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
|
||||
_mm512_slli_epi32( b, 7 ) ) ); \
|
||||
a = mm512_xor3( a, b, d ); \
|
||||
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
|
||||
a = mm512_rol_32( a, 5 ); \
|
||||
c = mm512_rol_32( c, 22 ); \
|
||||
} while (0)
|
||||
@@ -626,162 +610,184 @@ do { \
|
||||
|
||||
#define READ_STATE_BIG8(sc) \
|
||||
do { \
|
||||
c0 = sc->h[0x0]; \
|
||||
c1 = sc->h[0x1]; \
|
||||
c2 = sc->h[0x2]; \
|
||||
c3 = sc->h[0x3]; \
|
||||
c4 = sc->h[0x4]; \
|
||||
c5 = sc->h[0x5]; \
|
||||
c6 = sc->h[0x6]; \
|
||||
c7 = sc->h[0x7]; \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG8(sc) \
|
||||
do { \
|
||||
sc->h[0x0] = c0; \
|
||||
sc->h[0x1] = c1; \
|
||||
sc->h[0x2] = c2; \
|
||||
sc->h[0x3] = c3; \
|
||||
sc->h[0x4] = c4; \
|
||||
sc->h[0x5] = c5; \
|
||||
sc->h[0x6] = c6; \
|
||||
sc->h[0x7] = c7; \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define ROUND_BIG8(rc, alpha) \
|
||||
#define ROUND_BIG8( alpha ) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
s0 = _mm512_xor_si512( s0, m512_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
__m512i t0, t1, t2, t3, t4, t5; \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
|
||||
s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
|
||||
s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
|
||||
s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
|
||||
s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
|
||||
s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
|
||||
s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
|
||||
s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
|
||||
sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
|
||||
sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
|
||||
sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
|
||||
sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
|
||||
sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
|
||||
sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
|
||||
\
|
||||
SBOX8( s0, s4, s8, sC ); \
|
||||
SBOX8( s1, s5, s9, sD ); \
|
||||
SBOX8( s2, s6, sA, sE ); \
|
||||
SBOX8( s3, s7, sB, sF ); \
|
||||
SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
|
||||
SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
|
||||
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
|
||||
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
|
||||
_mm512_bslli_epi128( s5, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
|
||||
_mm512_bslli_epi128( sE, 4 ) ); \
|
||||
L8( s0, t1, s9, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s4 = mm512_swap64_32( s4 ); \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
||||
L8( s0, t0, s9, t1 ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( s6, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
|
||||
_mm512_bslli_epi128( sF, 4 ) ); \
|
||||
L8( s1, t1, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
|
||||
L8( s1, t2, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
|
||||
_mm512_bslli_epi128( s7, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
|
||||
_mm512_bslli_epi128( sC, 4 ) ); \
|
||||
L8( s2, t1, sB, t3 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
||||
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
||||
L8( s2, t4, sB, t5 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
|
||||
_mm512_bslli_epi128( s4, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
L8( s3, t1, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
|
||||
L8( s3, t2, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
|
||||
_mm512_bslli_epi128( sB, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
|
||||
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
||||
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
|
||||
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
|
||||
s4 = mm512_swap64_32( s4 ); \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
} while (0)
|
||||
|
||||
#define P_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8(0, alpha_n); \
|
||||
ROUND_BIG8(1, alpha_n); \
|
||||
ROUND_BIG8(2, alpha_n); \
|
||||
ROUND_BIG8(3, alpha_n); \
|
||||
ROUND_BIG8(4, alpha_n); \
|
||||
ROUND_BIG8(5, alpha_n); \
|
||||
__m512i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8( 0, alpha_f); \
|
||||
ROUND_BIG8( 1, alpha_f); \
|
||||
ROUND_BIG8( 2, alpha_f); \
|
||||
ROUND_BIG8( 3, alpha_f); \
|
||||
ROUND_BIG8( 4, alpha_f); \
|
||||
ROUND_BIG8( 5, alpha_f); \
|
||||
ROUND_BIG8( 6, alpha_f); \
|
||||
ROUND_BIG8( 7, alpha_f); \
|
||||
ROUND_BIG8( 8, alpha_f); \
|
||||
ROUND_BIG8( 9, alpha_f); \
|
||||
ROUND_BIG8(10, alpha_f); \
|
||||
ROUND_BIG8(11, alpha_f); \
|
||||
__m512i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG8 \
|
||||
do { /* order is important */ \
|
||||
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
|
||||
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
|
||||
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
|
||||
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
|
||||
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
|
||||
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
|
||||
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
|
||||
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
|
||||
c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
|
||||
c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
|
||||
c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
|
||||
c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
|
||||
c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
|
||||
c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
|
||||
c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
|
||||
c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
|
||||
} while (0)
|
||||
|
||||
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
|
||||
@@ -818,7 +824,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
||||
WRITE_STATE_BIG8( sc );
|
||||
}
|
||||
|
||||
|
||||
void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
@@ -849,13 +854,11 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
||||
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
{
|
||||
__m512i pad[1];
|
||||
int ch, cl;
|
||||
uint32_t ch, cl;
|
||||
|
||||
sph_enc32be( &ch, sc->count_high );
|
||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
|
||||
cl, ch, cl, ch, cl, ch, cl, ch );
|
||||
// pad[0] = m512_const2_32( cl, ch );
|
||||
pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||
sc->buf[0] = m512_const1_64( 0x80 );
|
||||
hamsi_8way_big( sc, sc->buf, 1 );
|
||||
hamsi_8way_big_final( sc, pad );
|
||||
@@ -863,22 +866,19 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
mm512_block_bswap_32( (__m512i*)dst, sc->h );
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
// Hamsi 4 way
|
||||
// Hamsi 4 way AVX2
|
||||
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
__m256i db = *buf; \
|
||||
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 63; u >= 0; u-- ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
|
||||
dm = mm256_negate_32( _mm256_or_si256( dm, \
|
||||
_mm256_slli_epi64( dm, 32 ) ) ); \
|
||||
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
|
||||
m256_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
|
||||
@@ -896,7 +896,6 @@ do { \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
|
||||
m256_const1_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
db = _mm256_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -916,10 +915,9 @@ do { \
|
||||
d = _mm256_xor_si256( d, a ); \
|
||||
a = _mm256_and_si256( a, b ); \
|
||||
t = _mm256_xor_si256( t, a ); \
|
||||
b = _mm256_xor_si256( b, d ); \
|
||||
b = _mm256_xor_si256( b, t ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
c = _mm256_xor_si256( b, d ); \
|
||||
c = _mm256_xor_si256( c, t ); \
|
||||
b = d; \
|
||||
d = mm256_not( t ); \
|
||||
} while (0)
|
||||
@@ -945,180 +943,184 @@ do { \
|
||||
|
||||
#define READ_STATE_BIG(sc) \
|
||||
do { \
|
||||
c0 = sc->h[0x0]; \
|
||||
c1 = sc->h[0x1]; \
|
||||
c2 = sc->h[0x2]; \
|
||||
c3 = sc->h[0x3]; \
|
||||
c4 = sc->h[0x4]; \
|
||||
c5 = sc->h[0x5]; \
|
||||
c6 = sc->h[0x6]; \
|
||||
c7 = sc->h[0x7]; \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG(sc) \
|
||||
do { \
|
||||
sc->h[0x0] = c0; \
|
||||
sc->h[0x1] = c1; \
|
||||
sc->h[0x2] = c2; \
|
||||
sc->h[0x3] = c3; \
|
||||
sc->h[0x4] = c4; \
|
||||
sc->h[0x5] = c5; \
|
||||
sc->h[0x6] = c6; \
|
||||
sc->h[0x7] = c7; \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define s0 m0
|
||||
#define s1 c0
|
||||
#define s2 m1
|
||||
#define s3 c1
|
||||
#define s4 c2
|
||||
#define s5 m2
|
||||
#define s6 c3
|
||||
#define s7 m3
|
||||
#define s8 m4
|
||||
#define s9 c4
|
||||
#define sA m5
|
||||
#define sB c5
|
||||
#define sC c6
|
||||
#define sD m6
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
*/
|
||||
|
||||
#define ROUND_BIG(rc, alpha) \
|
||||
#define ROUND_BIG( alpha ) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
__m256i t0, t1, t2, t3, t4, t5; \
|
||||
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
|
||||
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
|
||||
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
|
||||
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
|
||||
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
|
||||
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
|
||||
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
|
||||
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
|
||||
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
|
||||
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
|
||||
sA = _mm256_xor_si256( sA, alpha[10] ); \
|
||||
sB = _mm256_xor_si256( sB, alpha[11] ); \
|
||||
sC = _mm256_xor_si256( sC, alpha[12] ); \
|
||||
sD = _mm256_xor_si256( sD, alpha[13] ); \
|
||||
sE = _mm256_xor_si256( sE, alpha[14] ); \
|
||||
sF = _mm256_xor_si256( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX( s0, s4, s8, sC ); \
|
||||
SBOX( s1, s5, s9, sD ); \
|
||||
SBOX( s2, s6, sA, sE ); \
|
||||
SBOX( s3, s7, sB, sF ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
|
||||
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
|
||||
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
|
||||
L( s0, t1, s9, t3 ); \
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s4 = mm256_swap64_32( s4 ); \
|
||||
s5 = mm256_swap64_32( s5 ); \
|
||||
sD = mm256_swap64_32( sD ); \
|
||||
sE = mm256_swap64_32( sE ); \
|
||||
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
|
||||
L( s0, t0, s9, t1 ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
|
||||
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
|
||||
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
|
||||
L( s1, t1, sA, t3 ); \
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
|
||||
L( s1, t2, sA, t3 ); \
|
||||
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
|
||||
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
|
||||
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
|
||||
L( s2, t1, sB, t3 ); \
|
||||
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
|
||||
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
|
||||
L( s2, t4, sB, t5 ); \
|
||||
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
|
||||
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
|
||||
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
|
||||
L( s3, t1, s8, t3 ); \
|
||||
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
|
||||
L( s3, t2, s8, t3 ); \
|
||||
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
|
||||
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
\
|
||||
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
|
||||
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
|
||||
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
|
||||
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
|
||||
t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
|
||||
t3 = mm256_swap64_32( t3 ); \
|
||||
L( t0, t1, t2, t3 ); \
|
||||
t3 = mm256_swap64_32( t3 ); \
|
||||
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
|
||||
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
|
||||
s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
|
||||
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
|
||||
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
|
||||
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
|
||||
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
|
||||
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
|
||||
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
|
||||
s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
|
||||
s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
|
||||
sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
|
||||
s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
|
||||
sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
|
||||
\
|
||||
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
|
||||
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
|
||||
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
|
||||
t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
|
||||
L( t0, t1, t2, t3 ); \
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
|
||||
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
|
||||
sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
|
||||
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
|
||||
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
|
||||
s4 = mm256_swap64_32( s4 ); \
|
||||
s5 = mm256_swap64_32( s5 ); \
|
||||
sD = mm256_swap64_32( sD ); \
|
||||
sE = mm256_swap64_32( sE ); \
|
||||
} while (0)
|
||||
|
||||
#define P_BIG \
|
||||
do { \
|
||||
ROUND_BIG(0, alpha_n); \
|
||||
ROUND_BIG(1, alpha_n); \
|
||||
ROUND_BIG(2, alpha_n); \
|
||||
ROUND_BIG(3, alpha_n); \
|
||||
ROUND_BIG(4, alpha_n); \
|
||||
ROUND_BIG(5, alpha_n); \
|
||||
__m256i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG \
|
||||
do { \
|
||||
ROUND_BIG( 0, alpha_f); \
|
||||
ROUND_BIG( 1, alpha_f); \
|
||||
ROUND_BIG( 2, alpha_f); \
|
||||
ROUND_BIG( 3, alpha_f); \
|
||||
ROUND_BIG( 4, alpha_f); \
|
||||
ROUND_BIG( 5, alpha_f); \
|
||||
ROUND_BIG( 6, alpha_f); \
|
||||
ROUND_BIG( 7, alpha_f); \
|
||||
ROUND_BIG( 8, alpha_f); \
|
||||
ROUND_BIG( 9, alpha_f); \
|
||||
ROUND_BIG(10, alpha_f); \
|
||||
ROUND_BIG(11, alpha_f); \
|
||||
__m256i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG \
|
||||
do { /* order is important */ \
|
||||
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
|
||||
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
|
||||
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
|
||||
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
|
||||
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
|
||||
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
|
||||
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
|
||||
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
|
||||
c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
|
||||
c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
|
||||
c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
|
||||
c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
|
||||
c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
|
||||
c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
|
||||
c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
|
||||
c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
|
||||
} while (0)
|
||||
|
||||
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
|
||||
@@ -1186,14 +1188,12 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
||||
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
{
|
||||
__m256i pad[1];
|
||||
int ch, cl;
|
||||
uint32_t ch, cl;
|
||||
|
||||
sph_enc32be( &ch, sc->count_high );
|
||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
|
||||
pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||
sc->buf[0] = m256_const1_64( 0x80 );
|
||||
// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
||||
// 0UL, 0x80UL, 0UL, 0x80UL );
|
||||
hamsi_big( sc, sc->buf, 1 );
|
||||
hamsi_big_final( sc, pad );
|
||||
|
||||
|
@@ -141,6 +141,13 @@ do { \
|
||||
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
|
||||
mm128_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* PASSy(n, in) computes pass number "y", for a total of "n", using the
|
||||
* one-argument macro "in" to access input words. Current state is assumed
|
||||
@@ -152,22 +159,22 @@ do { \
|
||||
#define PASS1(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6), SPH_C32(0x00000000)); \
|
||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7), SPH_C32(0x00000000)); \
|
||||
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -522,49 +529,52 @@ do { \
|
||||
|
||||
// Haval-256 8 way 32 bit avx2
|
||||
|
||||
#if defined (__AVX512VL__)
|
||||
|
||||
// ( ~( a ^ b ) ) & c
|
||||
#define mm256_andnotxor( a, b, c ) \
|
||||
_mm256_ternarylogic_epi32( a, b, c, 0x82 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_andnotxor( a, b, c ) \
|
||||
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )
|
||||
|
||||
#endif
|
||||
|
||||
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( x0, \
|
||||
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
|
||||
mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) ) ) \
|
||||
_mm256_and_si256( x3, x6 ) ) ) \
|
||||
|
||||
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x2, \
|
||||
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ) ), \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
|
||||
mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
|
||||
mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
|
||||
mm256_andxor( x4, x1, x5 ), \
|
||||
mm256_xorand( x0, x3, x5 ) ) \
|
||||
|
||||
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
mm256_xor3( x0, \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), x0 ) )
|
||||
mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ) )
|
||||
|
||||
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
|
||||
mm256_xor3( \
|
||||
mm256_andxor( x3, x5, \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_or_si256( x4, x6 ) ) ), \
|
||||
_mm256_and_si256( x4, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
|
||||
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
|
||||
|
||||
mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
|
||||
_mm256_xor_si256( x1, x6 ) ) ), \
|
||||
mm256_xorand( x0, x2, x6 ) )
|
||||
|
||||
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x0, \
|
||||
mm256_not( _mm256_xor_si256( \
|
||||
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), \
|
||||
mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
|
||||
mm256_xor3( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) )
|
||||
|
||||
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
@@ -602,25 +612,32 @@ do { \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
#define PASS1_8W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7), SPH_C32(0x00000000)); \
|
||||
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
@@ -1,382 +0,0 @@
|
||||
/*
|
||||
* HEFTY1 cryptographic hash function
|
||||
*
|
||||
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* The views and conclusions contained in the software and documentation are those
|
||||
* of the authors and should not be interpreted as representing official policies,
|
||||
* either expressed or implied, of the FreeBSD Project.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define inline __inline
|
||||
#endif
|
||||
|
||||
#include "sph_hefty1.h"
|
||||
|
||||
#define Min(A, B) (A <= B ? A : B)
|
||||
#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
|
||||
{ \
|
||||
/* To thwart parallelism, Br modifies itself each time it's \
|
||||
* called. This also means that calling it in different \
|
||||
* orders yeilds different results. In C the order of \
|
||||
* evaluation of function arguments and + operands are \
|
||||
* unspecified (and depends on the compiler), so we must make \
|
||||
* the order of Br calls explicit. \
|
||||
*/ \
|
||||
uint32_t brG = Br(ctx, G); \
|
||||
uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \
|
||||
uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \
|
||||
uint32_t brC = Br(ctx, C); \
|
||||
uint32_t brB = Br(ctx, B); \
|
||||
uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \
|
||||
uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \
|
||||
H = G; \
|
||||
G = F; \
|
||||
F = E; \
|
||||
E = D + Br(ctx, tmp2); \
|
||||
D = C; \
|
||||
C = B; \
|
||||
B = A; \
|
||||
A = tmp2 + tmp4; \
|
||||
} \
|
||||
|
||||
/* Nothing up my sleeve constants */
|
||||
const static uint32_t K[64] = {
|
||||
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
||||
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
||||
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
|
||||
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
|
||||
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
|
||||
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
|
||||
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
|
||||
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
|
||||
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
|
||||
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
|
||||
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
|
||||
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
|
||||
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
|
||||
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
|
||||
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
|
||||
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
||||
};
|
||||
|
||||
/* Initial hash values */
|
||||
const static uint32_t H[HEFTY1_STATE_WORDS] = {
|
||||
0x6a09e667UL,
|
||||
0xbb67ae85UL,
|
||||
0x3c6ef372UL,
|
||||
0xa54ff53aUL,
|
||||
0x510e527fUL,
|
||||
0x9b05688cUL,
|
||||
0x1f83d9abUL,
|
||||
0x5be0cd19UL
|
||||
};
|
||||
|
||||
static inline uint32_t Rr(uint32_t X, uint8_t n)
|
||||
{
|
||||
return (X >> n) | (X << (32 - n));
|
||||
}
|
||||
|
||||
static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
|
||||
{
|
||||
return (E & F) ^ (~E & G);
|
||||
}
|
||||
|
||||
static inline uint32_t Sigma1(uint32_t E)
|
||||
{
|
||||
return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
|
||||
}
|
||||
|
||||
static inline uint32_t sigma1(uint32_t X)
|
||||
{
|
||||
return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
|
||||
}
|
||||
|
||||
static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
|
||||
{
|
||||
return (A & B) ^ (A & C) ^ (B & C);
|
||||
}
|
||||
|
||||
static inline uint32_t Sigma0(uint32_t A)
|
||||
{
|
||||
return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
|
||||
}
|
||||
|
||||
static inline uint32_t sigma0(uint32_t X)
|
||||
{
|
||||
return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
|
||||
}
|
||||
|
||||
static inline uint32_t Reverse32(uint32_t n)
|
||||
{
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
|
||||
#else
|
||||
return n;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint64_t Reverse64(uint64_t n)
|
||||
{
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t a = n >> 32;
|
||||
uint32_t b = (n << 32) >> 32;
|
||||
|
||||
return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
|
||||
#else
|
||||
return n;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Smoosh byte into nibble */
|
||||
static inline uint8_t Smoosh4(uint8_t X)
|
||||
{
|
||||
return (X >> 4) ^ (X & 0xf);
|
||||
}
|
||||
|
||||
/* Smoosh 32-bit word into 2-bits */
|
||||
static inline uint8_t Smoosh2(uint32_t X)
|
||||
{
|
||||
uint16_t w = (X >> 16) ^ (X & 0xffff);
|
||||
uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
|
||||
return (n >> 2) ^ (n & 0x3);
|
||||
}
|
||||
|
||||
static void Mangle(uint32_t *S)
|
||||
{
|
||||
uint32_t *R = S;
|
||||
uint32_t *C = &S[1];
|
||||
|
||||
uint8_t r0 = Smoosh4(R[0] >> 24);
|
||||
uint8_t r1 = Smoosh4(R[0] >> 16);
|
||||
uint8_t r2 = Smoosh4(R[0] >> 8);
|
||||
uint8_t r3 = Smoosh4(R[0] & 0xff);
|
||||
|
||||
int i;
|
||||
|
||||
/* Diffuse */
|
||||
uint32_t tmp = 0;
|
||||
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
|
||||
uint8_t r = Smoosh2(tmp);
|
||||
switch (r) {
|
||||
case 0:
|
||||
C[i] ^= Rr(R[0], i + r0);
|
||||
break;
|
||||
case 1:
|
||||
C[i] += Rr(~R[0], i + r1);
|
||||
break;
|
||||
case 2:
|
||||
C[i] &= Rr(~R[0], i + r2);
|
||||
break;
|
||||
case 3:
|
||||
C[i] ^= Rr(R[0], i + r3);
|
||||
break;
|
||||
}
|
||||
tmp ^= C[i];
|
||||
}
|
||||
|
||||
/* Compress */
|
||||
tmp = 0;
|
||||
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
|
||||
if (i % 2)
|
||||
tmp ^= C[i];
|
||||
else
|
||||
tmp += C[i];
|
||||
R[0] ^= tmp;
|
||||
}
|
||||
|
||||
static void Absorb(uint32_t *S, uint32_t X)
|
||||
{
|
||||
uint32_t *R = S;
|
||||
R[0] ^= X;
|
||||
Mangle(S);
|
||||
}
|
||||
|
||||
static uint32_t Squeeze(uint32_t *S)
|
||||
{
|
||||
uint32_t Y = S[0];
|
||||
Mangle(S);
|
||||
return Y;
|
||||
}
|
||||
|
||||
/* Branch, compress and serialize function */
|
||||
static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
|
||||
{
|
||||
uint32_t R = Squeeze(ctx->sponge);
|
||||
|
||||
uint8_t r0 = R >> 8;
|
||||
uint8_t r1 = R & 0xff;
|
||||
|
||||
uint32_t Y = 1 << (r0 % 32);
|
||||
|
||||
switch (r1 % 4)
|
||||
{
|
||||
case 0:
|
||||
/* Do nothing */
|
||||
break;
|
||||
case 1:
|
||||
return X & ~Y;
|
||||
case 2:
|
||||
return X | Y;
|
||||
case 3:
|
||||
return X ^ Y;
|
||||
}
|
||||
|
||||
return X;
|
||||
}
|
||||
|
||||
static void HashBlock(HEFTY1_CTX *ctx)
|
||||
{
|
||||
uint32_t A, B, C, D, E, F, G, H;
|
||||
uint32_t W[HEFTY1_BLOCK_BYTES];
|
||||
|
||||
assert(ctx);
|
||||
|
||||
A = ctx->h[0];
|
||||
B = ctx->h[1];
|
||||
C = ctx->h[2];
|
||||
D = ctx->h[3];
|
||||
E = ctx->h[4];
|
||||
F = ctx->h[5];
|
||||
G = ctx->h[6];
|
||||
H = ctx->h[7];
|
||||
|
||||
int t = 0;
|
||||
for (; t < 16; t++) {
|
||||
W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
|
||||
Absorb(ctx->sponge, W[t] ^ K[t]);
|
||||
}
|
||||
|
||||
for (t = 0; t < 16; t++) {
|
||||
Absorb(ctx->sponge, D ^ H);
|
||||
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
|
||||
}
|
||||
for (t = 16; t < 64; t++) {
|
||||
Absorb(ctx->sponge, H + D);
|
||||
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
|
||||
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
|
||||
}
|
||||
|
||||
ctx->h[0] += A;
|
||||
ctx->h[1] += B;
|
||||
ctx->h[2] += C;
|
||||
ctx->h[3] += D;
|
||||
ctx->h[4] += E;
|
||||
ctx->h[5] += F;
|
||||
ctx->h[6] += G;
|
||||
ctx->h[7] += H;
|
||||
|
||||
A = 0;
|
||||
B = 0;
|
||||
C = 0;
|
||||
D = 0;
|
||||
E = 0;
|
||||
F = 0;
|
||||
G = 0;
|
||||
H = 0;
|
||||
|
||||
memset(W, 0, sizeof(W));
|
||||
}
|
||||
|
||||
/* Public interface */
|
||||
|
||||
void HEFTY1_Init(HEFTY1_CTX *ctx)
|
||||
{
|
||||
assert(ctx);
|
||||
|
||||
memcpy(ctx->h, H, sizeof(ctx->h));
|
||||
memset(ctx->block, 0, sizeof(ctx->block));
|
||||
ctx->written = 0;
|
||||
memset(ctx->sponge, 0, sizeof(ctx->sponge));
|
||||
}
|
||||
|
||||
void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
|
||||
{
|
||||
assert(ctx);
|
||||
|
||||
uint64_t read = 0;
|
||||
while (len) {
|
||||
size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
|
||||
size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
|
||||
memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
|
||||
len -= count;
|
||||
read += count;
|
||||
ctx->written += count;
|
||||
if (!(ctx->written % HEFTY1_BLOCK_BYTES))
|
||||
HashBlock(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
|
||||
{
|
||||
assert(digest);
|
||||
assert(ctx);
|
||||
|
||||
/* Pad message (FIPS 180 Section 5.1.1) */
|
||||
size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
|
||||
ctx->block[used++] = 0x80; /* Append 1 to end of message */
|
||||
if (used > HEFTY1_BLOCK_BYTES - 8) {
|
||||
/* We have already written into the last 64bits, so
|
||||
* we must continue into the next block. */
|
||||
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
|
||||
HashBlock(ctx);
|
||||
used = 0; /* Create a new block (below) */
|
||||
}
|
||||
|
||||
/* All remaining bits to zero */
|
||||
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
|
||||
|
||||
/* The last 64bits encode the length (in network byte order) */
|
||||
uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
|
||||
*len = Reverse64(ctx->written*8);
|
||||
|
||||
HashBlock(ctx);
|
||||
|
||||
/* Convert back to network byte order */
|
||||
int i = 0;
|
||||
for (; i < HEFTY1_STATE_WORDS; i++)
|
||||
ctx->h[i] = Reverse32(ctx->h[i]);
|
||||
|
||||
memcpy(digest, ctx->h, sizeof(ctx->h));
|
||||
memset(ctx, 0, sizeof(HEFTY1_CTX));
|
||||
}
|
||||
|
||||
unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
|
||||
{
|
||||
HEFTY1_CTX ctx;
|
||||
static unsigned char m[HEFTY1_DIGEST_BYTES];
|
||||
|
||||
if (!digest)
|
||||
digest = m;
|
||||
|
||||
HEFTY1_Init(&ctx);
|
||||
HEFTY1_Update(&ctx, buf, len);
|
||||
HEFTY1_Final(digest, &ctx);
|
||||
|
||||
return digest;
|
||||
}
|
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
* HEFTY1 cryptographic hash function
|
||||
*
|
||||
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* The views and conclusions contained in the software and documentation are those
|
||||
* of the authors and should not be interpreted as representing official policies,
|
||||
* either expressed or implied, of the FreeBSD Project.
|
||||
*/
|
||||
|
||||
#ifndef __HEFTY1_H__
|
||||
#define __HEFTY1_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef WIN32
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
#define HEFTY1_DIGEST_BYTES 32
|
||||
#define HEFTY1_BLOCK_BYTES 64
|
||||
#define HEFTY1_STATE_WORDS 8
|
||||
#define HEFTY1_SPONGE_WORDS 4
|
||||
|
||||
typedef struct HEFTY1_CTX {
|
||||
uint32_t h[HEFTY1_STATE_WORDS];
|
||||
uint8_t block[HEFTY1_BLOCK_BYTES];
|
||||
uint64_t written;
|
||||
uint32_t sponge[HEFTY1_SPONGE_WORDS];
|
||||
} HEFTY1_CTX;
|
||||
|
||||
void HEFTY1_Init(HEFTY1_CTX *cxt);
|
||||
void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
|
||||
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
|
||||
unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __HEFTY1_H__ */
|
@@ -99,13 +99,13 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
|
||||
// called only by thread 0, saves a backup of g_work
|
||||
void hodl_get_new_work( struct work* work, struct work* g_work)
|
||||
{
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
|
||||
work_free( &hodl_work );
|
||||
work_copy( &hodl_work, g_work );
|
||||
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
}
|
||||
|
||||
json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
|
||||
@@ -125,7 +125,7 @@ json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
|
||||
}
|
||||
|
||||
// called by every thread, copies the backup to each thread's work.
|
||||
void hodl_resync_threads( struct work* work )
|
||||
void hodl_resync_threads( int thr_id, struct work* work )
|
||||
{
|
||||
int nonce_index = algo_gate.nonce_index;
|
||||
pthread_barrier_wait( &hodl_barrier );
|
||||
@@ -135,6 +135,7 @@ void hodl_resync_threads( struct work* work )
|
||||
work_copy( work, &hodl_work );
|
||||
}
|
||||
work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] );
|
||||
work_restart[thr_id].restart = 0;
|
||||
}
|
||||
|
||||
bool hodl_do_this_thread( int thr_id )
|
||||
|
@@ -7,6 +7,7 @@
|
||||
#include "hodl-gate.h"
|
||||
#include "hodl-wolf.h"
|
||||
#include "miner.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
|
@@ -45,6 +45,6 @@ void sha512Compute32b_parallel(
|
||||
uint64_t *data[SHA512_PARALLEL_N],
|
||||
uint64_t *digest[SHA512_PARALLEL_N]);
|
||||
|
||||
void sha512ProcessBlock(Sha512Context *context);
|
||||
void sha512ProcessBlock(Sha512Context contexti[2] );
|
||||
|
||||
#endif
|
||||
|
@@ -49,17 +49,16 @@ extern "C"{
|
||||
|
||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m512i cc = _mm512_set1_epi64( c ); \
|
||||
x3 = mm512_not( x3 ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
|
||||
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
|
||||
const __m512i cc = _mm512_set1_epi64( c ); \
|
||||
x0 = mm512_xorandnot( x0, x2, cc ); \
|
||||
tmp = mm512_xorand( cc, x0, x1 ); \
|
||||
x0 = mm512_xorandnot( x0, x3, x2 ); \
|
||||
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
|
||||
x1 = mm512_xorand( x1, x0, x2 ); \
|
||||
x2 = mm512_xorandnot( x2, x3, x0 ); \
|
||||
x0 = mm512_xoror( x0, x1, x3 ); \
|
||||
x3 = mm512_xorand( x3, x1, x2 ); \
|
||||
x1 = mm512_xorand( x1, tmp, x0 ); \
|
||||
x2 = _mm512_xor_si512( x2, tmp ); \
|
||||
} while (0)
|
||||
|
||||
@@ -67,11 +66,11 @@ do { \
|
||||
do { \
|
||||
x4 = _mm512_xor_si512( x4, x1 ); \
|
||||
x5 = _mm512_xor_si512( x5, x2 ); \
|
||||
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
|
||||
x6 = mm512_xor3( x6, x3, x0 ); \
|
||||
x7 = _mm512_xor_si512( x7, x0 ); \
|
||||
x0 = _mm512_xor_si512( x0, x5 ); \
|
||||
x1 = _mm512_xor_si512( x1, x6 ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
|
||||
x2 = mm512_xor3( x2, x7, x4 ); \
|
||||
x3 = _mm512_xor_si512( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
@@ -79,7 +78,7 @@ do { \
|
||||
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m256i cc = _mm256_set1_epi64x( c ); \
|
||||
const __m256i cc = _mm256_set1_epi64x( c ); \
|
||||
x3 = mm256_not( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
|
||||
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
|
||||
@@ -318,12 +317,12 @@ static const sph_u64 C[] = {
|
||||
#define Wz_8W(x, c, n) \
|
||||
do { \
|
||||
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
|
||||
x ## h = _mm512_or_si512( _mm512_and_si512( \
|
||||
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
|
||||
x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
|
||||
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
|
||||
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
|
||||
x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
|
@@ -1,5 +1,6 @@
|
||||
#include "keccak-gate.h"
|
||||
#include "sph_keccak.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
int hard_coded_eb = 1;
|
||||
|
||||
|
@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
|
||||
#define WRITE_STATE(sc)
|
||||
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64_IOTA XOR64
|
||||
#define XOR64_IOTA XOR
|
||||
|
||||
|
||||
#define LPAR (
|
||||
#define RPAR )
|
||||
@@ -71,11 +72,15 @@ static const uint64_t RC[] = {
|
||||
// Targetted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define NOT64(d, s) (d = mm512_not( s ) )
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -233,12 +238,15 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
@@ -250,11 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) __m256i x
|
||||
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
|
||||
#define NOT64(d, s) (d = mm256_not( s ) )
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
|
||||
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -414,10 +426,14 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
@@ -1,6 +1,19 @@
|
||||
#ifdef TH_ELT
|
||||
#undef TH_ELT
|
||||
#endif
|
||||
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
XOR3( tt0, d0, d1, d4 ); \
|
||||
XOR( tt1, d2, d3 ); \
|
||||
XOR( tt0, tt0, tt1 ); \
|
||||
ROL64( tt0, tt0, 1 ); \
|
||||
XOR3( tt1, c0, c1, c4 ); \
|
||||
XOR3( tt0, tt0, c2, c3 ); \
|
||||
XOR( t, tt0, tt1 ); \
|
||||
} while (0)
|
||||
/*
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
@@ -17,7 +30,7 @@
|
||||
XOR64(tt2, tt2, tt3); \
|
||||
XOR64(t, tt0, tt2); \
|
||||
} while (0)
|
||||
|
||||
*/
|
||||
#ifdef THETA
|
||||
#undef THETA
|
||||
#endif
|
||||
@@ -110,20 +123,34 @@
|
||||
#ifdef KHI_XO
|
||||
#undef KHI_XO
|
||||
#endif
|
||||
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
XOROR(d, a, b, c); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
OR64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#ifdef KHI_XA
|
||||
#undef KHI_XA
|
||||
#endif
|
||||
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
XORAND(d, a, b, c); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
AND64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#ifdef KHI
|
||||
#undef KHI
|
||||
@@ -134,65 +161,47 @@
|
||||
do { \
|
||||
DECL64(c0); \
|
||||
DECL64(c1); \
|
||||
DECL64(c2); \
|
||||
DECL64(c3); \
|
||||
DECL64(c4); \
|
||||
DECL64(bnn); \
|
||||
NOT64(bnn, b20); \
|
||||
KHI_XO(c0, b00, b10, b20); \
|
||||
KHI_XO(c1, b10, bnn, b30); \
|
||||
KHI_XA(c2, b20, b30, b40); \
|
||||
KHI_XO(c3, b30, b40, b00); \
|
||||
KHI_XA(c4, b40, b00, b10); \
|
||||
KHI_XA(b20, b20, b30, b40); \
|
||||
KHI_XO(b30, b30, b40, b00); \
|
||||
KHI_XA(b40, b40, b00, b10); \
|
||||
MOV64(b00, c0); \
|
||||
MOV64(b10, c1); \
|
||||
MOV64(b20, c2); \
|
||||
MOV64(b30, c3); \
|
||||
MOV64(b40, c4); \
|
||||
NOT64(bnn, b41); \
|
||||
KHI_XO(c0, b01, b11, b21); \
|
||||
KHI_XA(c1, b11, b21, b31); \
|
||||
KHI_XO(c2, b21, b31, bnn); \
|
||||
KHI_XO(c3, b31, b41, b01); \
|
||||
KHI_XA(c4, b41, b01, b11); \
|
||||
KHI_XO(b21, b21, b31, bnn); \
|
||||
KHI_XO(b31, b31, b41, b01); \
|
||||
KHI_XA(b41, b41, b01, b11); \
|
||||
MOV64(b01, c0); \
|
||||
MOV64(b11, c1); \
|
||||
MOV64(b21, c2); \
|
||||
MOV64(b31, c3); \
|
||||
MOV64(b41, c4); \
|
||||
NOT64(bnn, b32); \
|
||||
KHI_XO(c0, b02, b12, b22); \
|
||||
KHI_XA(c1, b12, b22, b32); \
|
||||
KHI_XA(c2, b22, bnn, b42); \
|
||||
KHI_XO(c3, bnn, b42, b02); \
|
||||
KHI_XA(c4, b42, b02, b12); \
|
||||
KHI_XA(b22, b22, bnn, b42); \
|
||||
KHI_XO(b32, bnn, b42, b02); \
|
||||
KHI_XA(b42, b42, b02, b12); \
|
||||
MOV64(b02, c0); \
|
||||
MOV64(b12, c1); \
|
||||
MOV64(b22, c2); \
|
||||
MOV64(b32, c3); \
|
||||
MOV64(b42, c4); \
|
||||
NOT64(bnn, b33); \
|
||||
KHI_XA(c0, b03, b13, b23); \
|
||||
KHI_XO(c1, b13, b23, b33); \
|
||||
KHI_XO(c2, b23, bnn, b43); \
|
||||
KHI_XA(c3, bnn, b43, b03); \
|
||||
KHI_XO(c4, b43, b03, b13); \
|
||||
KHI_XO(b23, b23, bnn, b43); \
|
||||
KHI_XA(b33, bnn, b43, b03); \
|
||||
KHI_XO(b43, b43, b03, b13); \
|
||||
MOV64(b03, c0); \
|
||||
MOV64(b13, c1); \
|
||||
MOV64(b23, c2); \
|
||||
MOV64(b33, c3); \
|
||||
MOV64(b43, c4); \
|
||||
NOT64(bnn, b14); \
|
||||
KHI_XA(c0, b04, bnn, b24); \
|
||||
KHI_XO(c1, bnn, b24, b34); \
|
||||
KHI_XA(c2, b24, b34, b44); \
|
||||
KHI_XO(c3, b34, b44, b04); \
|
||||
KHI_XA(c4, b44, b04, b14); \
|
||||
KHI_XA(b24, b24, b34, b44); \
|
||||
KHI_XO(b34, b34, b44, b04); \
|
||||
KHI_XA(b44, b44, b04, b14); \
|
||||
MOV64(b04, c0); \
|
||||
MOV64(b14, c1); \
|
||||
MOV64(b24, c2); \
|
||||
MOV64(b34, c3); \
|
||||
MOV64(b44, c4); \
|
||||
} while (0)
|
||||
|
||||
#ifdef IOTA
|
||||
@@ -201,6 +210,7 @@
|
||||
#define IOTA(r) XOR64_IOTA(a00, a00, r)
|
||||
|
||||
#ifdef P0
|
||||
#undef P0
|
||||
#undef P1
|
||||
#undef P2
|
||||
#undef P3
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -19,29 +19,37 @@
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi32( a1, b, 1 ); \
|
||||
a1 = _mm_alignr_epi32( b, a1, 1 ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
} while(0)
|
||||
|
||||
#else
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
static inline __m256i mult2_avx2( a )
|
||||
{
|
||||
__m128 a0, a0, b;
|
||||
a0 = mm128_extractlo_256( a );
|
||||
a1 = mm128_extracthi_256( a );
|
||||
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
|
||||
return mm256_concat_128( a1, a0 );
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
#define STEP_PART(x,c,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
@@ -73,13 +81,13 @@ static inline __m256i mult2_avx2( a )
|
||||
t = _mm_load_si128(&a0);\
|
||||
a0 = _mm_or_si128(a0,a1);\
|
||||
a2 = _mm_xor_si128(a2,a3);\
|
||||
a1 = _mm_andnot_si128(a1,ALLONE);\
|
||||
a1 = mm128_not( a1 );\
|
||||
a0 = _mm_xor_si128(a0,a3);\
|
||||
a3 = _mm_and_si128(a3,t);\
|
||||
a1 = _mm_xor_si128(a1,a3);\
|
||||
a3 = _mm_xor_si128(a3,a2);\
|
||||
a2 = _mm_and_si128(a2,a0);\
|
||||
a0 = _mm_andnot_si128(a0,ALLONE);\
|
||||
a0 = mm128_not( a0 );\
|
||||
a2 = _mm_xor_si128(a2,a1);\
|
||||
a1 = _mm_or_si128(a1,a3);\
|
||||
t = _mm_xor_si128(t,a1);\
|
||||
@@ -255,17 +263,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
|
||||
|
||||
|
||||
__m128i CNS128[32];
|
||||
__m128i ALLONE;
|
||||
#if !defined(__SSE4_1__)
|
||||
__m128i MASK;
|
||||
#endif
|
||||
|
||||
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
|
||||
{
|
||||
int i;
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
/* set all bits to '1' */
|
||||
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
|
||||
@@ -345,11 +354,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, m128_const_64( 0, 0x80000000 ),
|
||||
rnd512( state, mm128_mov64_128( 0x80000000 ),
|
||||
mm128_bswap_32( cast_m128i( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
|
||||
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
|
||||
|
||||
finalization512( state, (uint32*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -365,10 +374,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
|
||||
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
|
||||
int i;
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
/* set all bits to '1' */
|
||||
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
|
||||
@@ -394,11 +403,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, m128_const_64( 0, 0x80000000 ),
|
||||
rnd512( state, mm128_mov64_128( 0x80000000 ),
|
||||
mm128_bswap_32( cast_m128i( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
|
||||
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
|
||||
|
||||
finalization512( state, (uint32*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -587,7 +596,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
__m256i* chainv = (__m256i*)state->chainv;
|
||||
__m256i t;
|
||||
const __m128i zero = m128_zero;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
@@ -606,7 +615,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
@@ -621,7 +629,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
@@ -13,10 +13,9 @@
|
||||
|
||||
#if defined (ALLIUM_16WAY)
|
||||
|
||||
typedef struct {
|
||||
blake256_16way_context blake;
|
||||
typedef union {
|
||||
keccak256_8way_context keccak;
|
||||
cube_4way_context cube;
|
||||
cube_4way_2buf_context cube;
|
||||
skein256_8way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_context groestl;
|
||||
@@ -25,47 +24,31 @@ typedef struct {
|
||||
#endif
|
||||
} allium_16way_ctx_holder;
|
||||
|
||||
static __thread allium_16way_ctx_holder allium_16way_ctx;
|
||||
|
||||
bool init_allium_16way_ctx()
|
||||
{
|
||||
keccak256_8way_init( &allium_16way_ctx.keccak );
|
||||
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
|
||||
skein256_8way_init( &allium_16way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_16way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_16way_hash( void *state, const void *input )
|
||||
static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash9[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash10[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash11[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash12[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash13[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash14[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash8[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash9[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash10[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash11[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash12[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash13[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
|
||||
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
|
||||
blake256_16way_close( &ctx.blake, vhash );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
@@ -75,7 +58,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||
hash15, 256 );
|
||||
|
||||
// rintrlv_8x32_8x64( vhashA, vhash, 256 );
|
||||
keccak256_8way_init( &ctx.keccak );
|
||||
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_8way_close( &ctx.keccak, vhashA);
|
||||
keccak256_8way_init( &ctx.keccak );
|
||||
@@ -115,8 +98,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
@@ -124,8 +106,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
|
||||
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
|
||||
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
|
||||
@@ -160,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||
hash15, 256 );
|
||||
|
||||
skein256_8way_init( &ctx.skein );
|
||||
skein256_8way_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_8way_close( &ctx.skein, vhashA );
|
||||
skein256_8way_init( &ctx.skein );
|
||||
@@ -207,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
|
||||
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
|
||||
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -214,35 +197,60 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
// Prehash first block.
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
// Interleave hash for second block prehash.
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||
|
||||
blake256_16way_init( &allium_16way_ctx.blake );
|
||||
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_16way_hash( hash, vdata );
|
||||
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash+(lane<<3), mythr );
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
@@ -252,10 +260,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake256_8way_context blake;
|
||||
typedef union {
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
cube_2way_context cube;
|
||||
skein256_4way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_context groestl;
|
||||
@@ -264,25 +271,11 @@ typedef struct {
|
||||
#endif
|
||||
} allium_8way_ctx_holder;
|
||||
|
||||
static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
|
||||
bool init_allium_8way_ctx()
|
||||
{
|
||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
||||
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_8way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_8way_hash( void *hash, const void *input )
|
||||
static void allium_8way_hash( void *hash, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
|
||||
uint64_t *hash0 = (uint64_t*)hash;
|
||||
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
||||
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
||||
@@ -293,15 +286,14 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
|
||||
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
|
||||
blake256_8way_close( &ctx.blake, vhashA );
|
||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
keccak256_4way_init( &ctx.keccak );
|
||||
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhashA );
|
||||
keccak256_4way_init( &ctx.keccak );
|
||||
@@ -320,21 +312,19 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
|
||||
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash0, hash1, vhashA, 256 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 256 );
|
||||
|
||||
intrlv_2x128( vhashA, hash4, hash5, 256 );
|
||||
intrlv_2x128( vhashB, hash6, hash7, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash4, hash5, vhashA, 256 );
|
||||
dintrlv_2x128( hash6, hash7, vhashB, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
@@ -348,6 +338,7 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
skein256_4way_init( &ctx.skein );
|
||||
skein256_4way_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhashA );
|
||||
skein256_4way_init( &ctx.skein );
|
||||
@@ -356,8 +347,8 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
|
||||
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
|
||||
|
||||
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
|
||||
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
|
||||
@@ -392,36 +383,60 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
blake256_8way_init( &allium_8way_ctx.blake );
|
||||
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
|
||||
n+ 3, n+ 2, n+ 1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_8way_hash( hash, vdata );
|
||||
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
|
@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
||||
gate->hash = (void*)&lyra2z_16way_hash;
|
||||
// gate->hash = (void*)&lyra2z_16way_hash;
|
||||
#elif defined(LYRA2Z_8WAY)
|
||||
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
||||
gate->hash = (void*)&lyra2z_8way_hash;
|
||||
// gate->hash = (void*)&lyra2z_8way_hash;
|
||||
#elif defined(LYRA2Z_4WAY)
|
||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||
@@ -175,20 +175,16 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||
gate->hash = (void*)&allium_16way_hash;
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||
gate->hash = (void*)&allium_8way_hash;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium;
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT | VAES256_OPT;
|
||||
| VAES_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
|
||||
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
|
||||
void lyra2z_16way_hash( void *state, const void *input );
|
||||
//void lyra2z_16way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_16way_thread_init();
|
||||
|
||||
#elif defined(LYRA2Z_8WAY)
|
||||
|
||||
void lyra2z_8way_hash( void *state, const void *input );
|
||||
//void lyra2z_8way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_8way_thread_init();
|
||||
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(ALLIUM_16WAY)
|
||||
|
||||
void allium_16way_hash( void *state, const void *input );
|
||||
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_16way_ctx();
|
||||
|
||||
#elif defined(ALLIUM_8WAY)
|
||||
|
||||
void allium_8way_hash( void *state, const void *input );
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_8way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
||||
keccak256_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
||||
hash12, hash13, hash14, hash5, vhash, 256 );
|
||||
hash12, hash13, hash14, hash15, vhash, 256 );
|
||||
|
||||
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
||||
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
|
||||
|
@@ -14,38 +14,28 @@ bool lyra2z_16way_thread_init()
|
||||
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static __thread blake256_16way_context l2z_16way_blake_mid;
|
||||
|
||||
void lyra2z_16way_midstate( const void* input )
|
||||
{
|
||||
blake256_16way_init( &l2z_16way_blake_mid );
|
||||
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
void lyra2z_16way_hash( void *state, const void *input )
|
||||
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash9[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash10[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash11[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash12[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash13[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash14[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (64)));
|
||||
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash8[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash9[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash10[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash11[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash12[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash13[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
|
||||
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
|
||||
blake256_16way_close( &ctx_blake, vhash );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||
@@ -97,40 +87,62 @@ void lyra2z_16way_hash( void *state, const void *input )
|
||||
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0000ff;
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
lyra2z_16way_midstate( vdata );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
lyra2z_16way_hash( hash, vdata );
|
||||
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash+(lane<<3), mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
@@ -145,30 +157,20 @@ bool lyra2z_8way_thread_init()
|
||||
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static __thread blake256_8way_context l2z_8way_blake_mid;
|
||||
|
||||
void lyra2z_8way_midstate( const void* input )
|
||||
{
|
||||
blake256_8way_init( &l2z_8way_blake_mid );
|
||||
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
void lyra2z_8way_hash( void *state, const void *input )
|
||||
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
|
||||
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
|
||||
blake256_8way_close( &ctx_blake, vhash );
|
||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
@@ -182,7 +184,6 @@ void lyra2z_8way_hash( void *state, const void *input )
|
||||
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
|
||||
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
@@ -197,43 +198,66 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0000ff;
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
lyra2z_8way_midstate( vdata );
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
lyra2z_8way_hash( hash, vdata );
|
||||
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined(LYRA2Z_4WAY)
|
||||
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include "lyra2.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
__thread uint64_t* lyra2z330_wholeMatrix;
|
||||
static __thread uint64_t* lyra2z330_wholeMatrix;
|
||||
|
||||
void lyra2z330_hash(void *state, const void *input, uint32_t height)
|
||||
{
|
||||
|
@@ -85,9 +85,9 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
|
||||
|
||||
state0 =
|
||||
state1 = m512_zero;
|
||||
state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
state2 = _mm512_set4_epi64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
state3 = _mm512_set4_epi64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
|
||||
for ( int i = 0; i < nBlocks; i++ )
|
||||
@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
// overlap it's unified.
|
||||
// As a result normal is Nrows-2 / Nrows.
|
||||
// for 4 rows: 1 unified, 2 overlap, 1 normal.
|
||||
// for 8 rows: 1 unified, 2 overlap, 56 normal.
|
||||
// for 8 rows: 1 unified, 2 overlap, 5 normal.
|
||||
|
||||
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
io0 = _mm512_load_si512( inout0 );
|
||||
io1 = _mm512_load_si512( inout0 +1 );
|
||||
io2 = _mm512_load_si512( inout0 +2 );
|
||||
|
||||
io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1 );
|
||||
io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
|
||||
io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
|
||||
|
||||
/*
|
||||
io0 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
io2 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
*/
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
|
||||
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
io0.v512 = _mm512_load_si512( inout0 );
|
||||
io1.v512 = _mm512_load_si512( inout0 +1 );
|
||||
io2.v512 = _mm512_load_si512( inout0 +2 );
|
||||
|
||||
io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1 );
|
||||
io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
|
||||
io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
|
||||
|
||||
/*
|
||||
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
*/
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
|
||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
|
||||
|
||||
/*
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +1 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +1 ) );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
|
||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
|
||||
*/
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
|
||||
}
|
||||
|
||||
/*
|
||||
if ( rowOut == rowInOut0 )
|
||||
{
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
|
||||
|
||||
}
|
||||
if ( rowOut == rowInOut1 )
|
||||
{
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
|
||||
}
|
||||
*/
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
|
||||
casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
|
||||
casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
|
||||
_mm512_mask_store_epi64( inout1, 0xf0, io0.v512 );
|
||||
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
|
||||
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
|
||||
*/
|
||||
|
||||
|
||||
casti_m256i( inout0, 0 ) = io0.v256lo;
|
||||
casti_m256i( inout1, 1 ) = io0.v256hi;
|
||||
casti_m256i( inout0, 2 ) = io1.v256lo;
|
||||
casti_m256i( inout1, 3 ) = io1.v256hi;
|
||||
casti_m256i( inout0, 4 ) = io2.v256lo;
|
||||
casti_m256i( inout1, 5 ) = io2.v256hi;
|
||||
|
||||
/*
|
||||
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
|
||||
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
|
||||
|
@@ -48,9 +48,9 @@ inline void initState( uint64_t State[/*16*/] )
|
||||
const __m256i zero = m256_zero;
|
||||
state[0] = zero;
|
||||
state[1] = zero;
|
||||
state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
state[2] = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
state[3] = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
|
||||
#elif defined (__SSE2__)
|
||||
@@ -271,9 +271,9 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
|
||||
|
||||
state0 =
|
||||
state1 = m256_zero;
|
||||
state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
state2 = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
state3 = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
|
||||
for ( int i = 0; i < nBlocks; i++ )
|
||||
|
@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_ror256_64( s1); \
|
||||
s3 = mm512_shufll256_64( s3 ); \
|
||||
s1 = mm512_shuflr256_64( s1); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_rol256_64( s3 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_rol256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_ror256_64( s3 );
|
||||
s3 = mm512_shuflr256_64( s3 ); \
|
||||
s1 = mm512_shufll256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
@@ -97,23 +97,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
// returns void, updates all args
|
||||
#define G_4X64(a,b,c,d) \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
|
||||
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
|
||||
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_ror_1x64( s1); \
|
||||
s3 = mm256_shufll_64( s3 ); \
|
||||
s1 = mm256_shuflr_64( s1); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rol_1x64( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rol_1x64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_ror_1x64( s3 );
|
||||
s3 = mm256_shuflr_64( s3 ); \
|
||||
s1 = mm256_shufll_64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -137,25 +137,34 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
// returns void, all args updated
|
||||
#define G_2X64(a,b,c,d) \
|
||||
a = _mm_add_epi64( a, b ); \
|
||||
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
|
||||
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
|
||||
c = _mm_add_epi64( c, d ); \
|
||||
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
|
||||
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
|
||||
a = _mm_add_epi64( a, b ); \
|
||||
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
|
||||
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi64( c, d ); \
|
||||
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
{ \
|
||||
__m128i t; \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_ror256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_rol256_64( s6, s7 ); \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rol256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_ror256_64( s6, s7 );
|
||||
t = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s6 = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s2 = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s3 = t; \
|
||||
G_2X64( s0, s2, s5, s6 ); \
|
||||
G_2X64( s1, s3, s4, s7 ); \
|
||||
t = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s6 = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s2 = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s3 = t; \
|
||||
}
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
@@ -12,8 +12,8 @@
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/ripemd/sph_ripemd.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#define EPSa DBL_EPSILON
|
||||
#define EPS1 DBL_EPSILON
|
||||
@@ -105,8 +105,8 @@ uint32_t sw2_( int nnounce )
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
SHA256_CTX sha256;
|
||||
SHA512_CTX sha512;
|
||||
sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
sph_keccak512_context keccak;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -118,8 +118,8 @@ m7m_ctx_holder m7m_ctx;
|
||||
|
||||
void init_m7m_ctx()
|
||||
{
|
||||
SHA256_Init( &m7m_ctx.sha256 );
|
||||
SHA512_Init( &m7m_ctx.sha512 );
|
||||
sha256_ctx_init( &m7m_ctx.sha256 );
|
||||
sph_sha512_init( &m7m_ctx.sha512 );
|
||||
sph_keccak512_init( &m7m_ctx.keccak );
|
||||
sph_whirlpool_init( &m7m_ctx.whirlpool );
|
||||
sph_haval256_5_init( &m7m_ctx.haval );
|
||||
@@ -143,11 +143,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint8_t bhash[7][64] __attribute__((aligned(64)));
|
||||
uint32_t n = pdata[19] - 1;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
uint32_t usw_, mpzscale;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
char data_str[161], hash_str[65], target_str[65];
|
||||
//uint8_t *bdata = 0;
|
||||
uint8_t bdata[8192] __attribute__ ((aligned (64)));
|
||||
int i, digits;
|
||||
int bytes;
|
||||
@@ -155,12 +154,11 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
|
||||
SHA256_CTX ctxf_sha256;
|
||||
|
||||
memcpy(data, pdata, 80);
|
||||
|
||||
SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
|
||||
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
|
||||
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
|
||||
@@ -191,11 +189,11 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
|
||||
|
||||
SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
|
||||
sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sha256_final( &ctx2.sha256, bhash[0] );
|
||||
|
||||
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
|
||||
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha512_close( &ctx2.sha512, bhash[1] );
|
||||
|
||||
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
|
||||
@@ -227,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
bytes = mpz_sizeinbase(product, 256);
|
||||
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
SHA256_Init( &ctxf_sha256 );
|
||||
SHA256_Update( &ctxf_sha256, bdata, bytes );
|
||||
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
|
||||
sha256_full( hash, bdata, bytes );
|
||||
|
||||
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
|
||||
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
|
||||
@@ -262,18 +258,11 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
mpzscale=bytes;
|
||||
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
SHA256_Init( &ctxf_sha256 );
|
||||
SHA256_Update( &ctxf_sha256, bdata, bytes );
|
||||
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
|
||||
sha256_full( hash, bdata, bytes );
|
||||
}
|
||||
|
||||
|
||||
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
|
||||
&& !opt_benchmark ) )
|
||||
|
||||
|
||||
// if ( unlikely( hash[7] <= ptarget[7] ) )
|
||||
// if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
if ( opt_debug )
|
||||
{
|
||||
|
@@ -156,7 +156,7 @@ int scanhash_zr5( struct work *work, uint32_t max_nonce,
|
||||
void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr )
|
||||
{
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
|
||||
// ignore POK in first word
|
||||
const int wkcmp_sz = 72; // (19-1) * sizeof(uint32_t)
|
||||
@@ -174,7 +174,7 @@ void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
else
|
||||
++(*nonceptr);
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
}
|
||||
|
||||
void zr5_display_pok( struct work* work )
|
||||
|
@@ -312,10 +312,26 @@ do { \
|
||||
BUPDATE1_8W( 7, 1 ); \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )
|
||||
|
||||
#else
|
||||
|
||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||
(g ## n0 = _mm256_xor_si256( a ## n0, \
|
||||
_mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
||||
a ## n4 ) ) )
|
||||
|
||||
#endif
|
||||
|
||||
#define PI_ALL_8W do { \
|
||||
a0 = g0; \
|
||||
a1 = mm256_rol_32( g7, 1 ); \
|
||||
@@ -336,9 +352,6 @@ do { \
|
||||
a16 = mm256_rol_32( g10, 8 ); \
|
||||
} while (0)
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
||||
a ## n4 ) ) )
|
||||
|
||||
#define SIGMA_ALL_8W do { \
|
||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||
|
@@ -15,7 +15,8 @@
|
||||
|
||||
#if defined (ANIME_8WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
@@ -26,23 +27,9 @@ typedef struct {
|
||||
jh512_8way_context jh;
|
||||
skein512_8way_context skein;
|
||||
keccak512_8way_context keccak;
|
||||
} anime_8way_ctx_holder;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_anime_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &anime_8way_ctx.blake );
|
||||
bmw512_8way_init( &anime_8way_ctx.bmw );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
|
||||
#else
|
||||
init_groestl( &anime_8way_ctx.groestl, 64 );
|
||||
#endif
|
||||
skein512_8way_init( &anime_8way_ctx.skein );
|
||||
jh512_8way_init( &anime_8way_ctx.jh );
|
||||
keccak512_8way_init( &anime_8way_ctx.keccak );
|
||||
}
|
||||
typedef union _anime_8way_context_overlay anime_8way_context_overlay;
|
||||
|
||||
void anime_8way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__m512i* vhC = (__m512i*)vhashC;
|
||||
const __m512i bit3_mask = m512_const1_64( 8 );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
__mmask8 vh_mask;
|
||||
anime_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
|
||||
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (ANIME_4WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_4way_context jh;
|
||||
skein512_4way_context skein;
|
||||
keccak512_4way_context keccak;
|
||||
} anime_4way_ctx_holder;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _anime_4way_context_overlay anime_4way_context_overlay;
|
||||
|
||||
void anime_4way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
|
||||
int h_mask;
|
||||
const __m256i bit3_mask = m256_const1_64( 8 );
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
anime_4way_ctx_holder ctx;
|
||||
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, input, 80 );
|
||||
@@ -293,6 +281,17 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
@@ -13,6 +13,7 @@
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
@@ -64,14 +65,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash4 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash5 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash6 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash7 [16] __attribute__ ((aligned (32)));
|
||||
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
__mmask8 vh_mask;
|
||||
const __m512i vmask = m512_const1_64( 24 );
|
||||
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
|
||||
{
|
||||
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
// 4x32 for haval
|
||||
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( hash0[0] & mask )
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
|
||||
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_8way_close( &ctx.sha512, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
|
||||
cube_2way_context cube2;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd sd;
|
||||
shavite512_2way_context shavite2;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
@@ -633,19 +627,23 @@ union _hmq1725_4way_context_overlay
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
haval256_5_4way_context haval;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
echo_2way_context echo2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
|
||||
|
||||
extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash0 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3 [16] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2 [16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3 [16] __attribute__ ((aligned (32)));
|
||||
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
__m256i vh_mask;
|
||||
int h_mask;
|
||||
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
|
||||
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
|
||||
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
|
||||
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
|
||||
|
||||
intrlv_2x128_512( vhashA, hash0, hash1 );
|
||||
intrlv_2x128_512( vhashB, hash2, hash3 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
|
||||
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
@@ -808,6 +812,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
@@ -912,7 +918,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 );
|
||||
|
||||
// A = fugue serial, B = sha512 prarallel
|
||||
// A = fugue serial, B = sha512 parallel
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
sha512_4way_init( &ctx.sha512 );
|
||||
sha512_4way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_4way_close( &ctx.sha512, vhash );
|
||||
|
@@ -17,7 +17,7 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -44,7 +44,7 @@ typedef struct {
|
||||
sph_hamsi512_context hamsi1;
|
||||
sph_shabal512_context shabal1;
|
||||
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
|
||||
SHA512_CTX sha1, sha2;
|
||||
sph_sha512_context sha1, sha2;
|
||||
sph_haval256_5_context haval1, haval2;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo1, echo2;
|
||||
@@ -106,8 +106,8 @@ void init_hmq1725_ctx()
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
|
||||
|
||||
SHA512_Init( &hmq1725_ctx.sha1 );
|
||||
SHA512_Init( &hmq1725_ctx.sha2 );
|
||||
sph_sha512_init( &hmq1725_ctx.sha1 );
|
||||
sph_sha512_init( &hmq1725_ctx.sha2 );
|
||||
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval1);
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval2);
|
||||
@@ -285,8 +285,8 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
}
|
||||
else
|
||||
{
|
||||
SHA512_Update( &h_ctx.sha1, hashB, 64 );
|
||||
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
|
||||
sph_sha512( &h_ctx.sha1, hashB, 64 );
|
||||
sph_sha512_close( &h_ctx.sha1, hashA );
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
@@ -297,8 +297,8 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
|
||||
#endif
|
||||
|
||||
SHA512_Update( &h_ctx.sha2, hashB, 64 );
|
||||
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
|
||||
sph_sha512( &h_ctx.sha2, hashB, 64 );
|
||||
sph_sha512_close( &h_ctx.sha2, hashA );
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
|
@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -127,9 +124,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
if ( ( vh_mask & 0x0f ) != 0x0f )
|
||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
if ( ( vh_mask & 0xf0 ) != 0xf0 )
|
||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
@@ -139,21 +134,13 @@ void quark_8way_hash( void *state, const void *input )
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
if ( hash0[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
if ( hash1[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
if ( hash2[0] & 8)
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
if ( hash3[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
if ( hash4[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
if ( hash5[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
if ( hash6[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
if ( hash7[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
@@ -164,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -179,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,186 +0,0 @@
|
||||
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
|
||||
/**
|
||||
* RadioGatun interface.
|
||||
*
|
||||
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
|
||||
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
|
||||
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
|
||||
* August 24-25, 2006. The main Web site, containing that article, the
|
||||
* reference code and some test vectors, appears to be currently located
|
||||
* at the following URL: http://radiogatun.noekeon.org/
|
||||
*
|
||||
* The presentation article does not specify endianness or padding. The
|
||||
* reference code uses the following conventions, which we also apply
|
||||
* here:
|
||||
* <ul>
|
||||
* <li>The input message is an integral number of sequences of three
|
||||
* words. Each word is either a 32-bit of 64-bit word (depending on
|
||||
* the version of RadioGatun).</li>
|
||||
* <li>Input bytes are decoded into words using little-endian
|
||||
* convention.</li>
|
||||
* <li>Padding consists of a single bit of value 1, using little-endian
|
||||
* convention within bytes (i.e. for a byte-oriented input, a single
|
||||
* byte of value 0x01 is appended), then enough bits of value 0 to finish
|
||||
* the current block.</li>
|
||||
* <li>Output consists of 256 bits. Successive output words are encoded
|
||||
* with little-endian convention.</li>
|
||||
* </ul>
|
||||
* These conventions are very close to those we use for PANAMA, which is
|
||||
* a close ancestor or RadioGatun.
|
||||
*
|
||||
* RadioGatun is actually a family of functions, depending on some
|
||||
* internal parameters. We implement here two functions, with a "belt
|
||||
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
|
||||
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
|
||||
* variant uses 64-bit words.
|
||||
*
|
||||
* Strictly speaking, the name "RadioGatun" should use an acute accent
|
||||
* on the "u", which we omitted here to keep strict ASCII-compatibility
|
||||
* of this file.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_radiogatun.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_RADIOGATUN_H__
|
||||
#define SPH_RADIOGATUN_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RadioGatun[32].
|
||||
*/
|
||||
#define SPH_SIZE_radiogatun32 256
|
||||
|
||||
/**
|
||||
* This structure is a context for RadioGatun[32] computations: it
|
||||
* contains intermediate values and some data from the last entered
|
||||
* block. Once a RadioGatun[32] computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RadioGatun[32]
|
||||
* computation can be cloned by copying the context (e.g. with a
|
||||
* simple <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char data[156]; /* first field, for alignment */
|
||||
unsigned data_ptr;
|
||||
sph_u32 a[19], b[39];
|
||||
#endif
|
||||
} sph_radiogatun32_context;
|
||||
|
||||
/**
|
||||
* Initialize a RadioGatun[32] context. This process performs no
|
||||
* memory allocation.
|
||||
*
|
||||
* @param cc the RadioGatun[32] context (pointer to a
|
||||
* <code>sph_radiogatun32_context</code>)
|
||||
*/
|
||||
void sph_radiogatun32_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RadioGatun[32] context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_radiogatun32(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RadioGatun[32] computation and output the
|
||||
* result into the provided buffer. The destination buffer must be wide
|
||||
* enough to accomodate the result (32 bytes). The context is
|
||||
* automatically reinitialized.
|
||||
*
|
||||
* @param cc the RadioGatun[32] context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_radiogatun32_close(void *cc, void *dst);
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RadioGatun[64].
|
||||
*/
|
||||
#define SPH_SIZE_radiogatun64 256
|
||||
|
||||
/**
|
||||
* This structure is a context for RadioGatun[64] computations: it
|
||||
* contains intermediate values and some data from the last entered
|
||||
* block. Once a RadioGatun[64] computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RadioGatun[64]
|
||||
* computation can be cloned by copying the context (e.g. with a
|
||||
* simple <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char data[312]; /* first field, for alignment */
|
||||
unsigned data_ptr;
|
||||
sph_u64 a[19], b[39];
|
||||
#endif
|
||||
} sph_radiogatun64_context;
|
||||
|
||||
/**
|
||||
* Initialize a RadioGatun[64] context. This process performs no
|
||||
* memory allocation.
|
||||
*
|
||||
* @param cc the RadioGatun[64] context (pointer to a
|
||||
* <code>sph_radiogatun64_context</code>)
|
||||
*/
|
||||
void sph_radiogatun64_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RadioGatun[64] context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_radiogatun64(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RadioGatun[64] computation and output the
|
||||
* result into the provided buffer. The destination buffer must be wide
|
||||
* enough to accomodate the result (32 bytes). The context is
|
||||
* automatically reinitialized.
|
||||
*
|
||||
* @param cc the RadioGatun[64] context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_radiogatun64_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -4,24 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
double lbry_calc_network_diff( struct work *work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
|
||||
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
|
||||
for (int m=shift; m < 29; m++) d *= 256.0;
|
||||
for (int m=29; m < shift; m++) d /= 256.0;
|
||||
if (opt_debug_diff)
|
||||
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
// std_le should work but it doesn't
|
||||
void lbry_le_build_stratum_request( char *req, struct work *work,
|
||||
struct stratum_ctx *sctx )
|
||||
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
|
||||
free(xnonce2str);
|
||||
}
|
||||
|
||||
/*
|
||||
void lbry_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits )
|
||||
{
|
||||
int i;
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = version;
|
||||
|
||||
if ( have_stratum )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = le32dec( prevhash + i );
|
||||
else
|
||||
for (i = 0; i < 8; i++)
|
||||
g_work->data[ 8-i ] = le32dec( prevhash + i );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = be32dec( merkle_root + i );
|
||||
|
||||
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
|
||||
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
|
||||
g_work->data[28] = 0x80000000;
|
||||
}
|
||||
*/
|
||||
|
||||
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
unsigned char merkle_root[64] = { 0 };
|
||||
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&lbry_hash;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
#endif
|
||||
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
|
||||
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
||||
// gate->build_block_header = (void*)&build_block_header;
|
||||
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
||||
gate->ntime_index = LBRY_NTIME_INDEX;
|
||||
gate->nbits_index = LBRY_NBITS_INDEX;
|
||||
|
@@ -7,28 +7,23 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sph_ripemd.h"
|
||||
#include <openssl/sha.h>
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
void lbry_hash(void* output, const void* input)
|
||||
{
|
||||
SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
|
||||
SHA512_CTX ctx_sha512 __attribute__ ((aligned (64)));
|
||||
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
||||
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) hashA[16];
|
||||
uint32_t _ALIGN(64) hashB[16];
|
||||
uint32_t _ALIGN(64) hashC[16];
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, input, 112 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
sha256_full( hashA, input, 112 );
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, hashA, 32 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
|
||||
SHA512_Init( &ctx_sha512 );
|
||||
SHA512_Update( &ctx_sha512, hashA, 32 );
|
||||
SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
|
||||
sph_sha512_init( &ctx_sha512 );
|
||||
sph_sha512( &ctx_sha512, hashA, 32 );
|
||||
sph_sha512_close( &ctx_sha512, hashA );
|
||||
|
||||
sph_ripemd160_init( &ctx_ripemd );
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
|
||||
@@ -38,14 +33,12 @@ void lbry_hash(void* output, const void* input)
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
||||
sph_ripemd160_close( &ctx_ripemd, hashC );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, hashB, 20 );
|
||||
SHA256_Update( &ctx_sha256, hashC, 20 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
sha256_ctx_init( &ctx_sha256 );
|
||||
sha256_update( &ctx_sha256, hashB, 20 );
|
||||
sha256_update( &ctx_sha256, hashC, 20 );
|
||||
sha256_final( &ctx_sha256, hashA );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, hashA, 32 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
memcpy( output, hashA, 32 );
|
||||
}
|
||||
|
@@ -35,6 +35,7 @@
|
||||
|
||||
#include "sph_ripemd.h"
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Round functions for RIPEMD (original).
|
||||
*/
|
||||
@@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = {
|
||||
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
|
||||
SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
|
||||
};
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Round functions for RIPEMD-128 and RIPEMD-160.
|
||||
@@ -63,6 +65,8 @@ static const sph_u32 IV[5] = {
|
||||
|
||||
#define ROTL SPH_ROTL32
|
||||
|
||||
#if 0
|
||||
|
||||
/* ===================================================================== */
|
||||
/*
|
||||
* RIPEMD (original hash, deprecated).
|
||||
@@ -479,7 +483,7 @@ sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
|
||||
* One round of RIPEMD-128. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
ripemd128_round(const unsigned char *data, sph_u32 r[5])
|
||||
ripemd128_round(const unsigned char *data, sph_u32 r[4])
|
||||
{
|
||||
#if SPH_LITTLE_FAST
|
||||
|
||||
@@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
|
||||
#undef RIPEMD128_IN
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ===================================================================== */
|
||||
/*
|
||||
* RIPEMD-160.
|
||||
|
@@ -84,6 +84,7 @@
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
#if 0
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
@@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst);
|
||||
*/
|
||||
void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
|
||||
|
||||
#endif
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/**
|
||||
|
@@ -69,8 +69,12 @@ typedef unsigned int uint;
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64U
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32U
|
||||
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
|
||||
#define ROTL32(a,b) rol32(a,b)
|
||||
#define ROTR32(a,b) ror32(a,b)
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
|
2994
algo/scrypt/scrypt-core-4way.c
Normal file
2994
algo/scrypt/scrypt-core-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
70
algo/scrypt/scrypt-core-4way.h
Normal file
70
algo/scrypt/scrypt-core-4way.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifndef SCRYPT_CORE_4WAY_H__
|
||||
#define SCRYPT_CORE_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD over 4 way parallel
|
||||
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// 4 way parallel over serial SIMD
|
||||
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
|
||||
|
||||
// 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Double buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Triplee buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD128 over 2 way parallel
|
||||
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered simd over parallel
|
||||
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered 2 way
|
||||
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered
|
||||
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// Parallel 4 way, 4x memory
|
||||
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// Linear SIMD 1 way, 1x memory, lowest
|
||||
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered, 2x memory
|
||||
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered
|
||||
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered, 4x memory
|
||||
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
// For reference only
|
||||
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
206
algo/scrypt/scrypt-core-ref.c
Normal file
206
algo/scrypt/scrypt-core-ref.c
Normal file
@@ -0,0 +1,206 @@
|
||||
#include "scrypt-core-ref.h"
|
||||
|
||||
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
|
||||
|
||||
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
||||
{
|
||||
uint32_t x0 = (B[ 0] ^= C[ 0]),
|
||||
x1 = (B[ 1] ^= C[ 1]),
|
||||
x2 = (B[ 2] ^= C[ 2]),
|
||||
x3 = (B[ 3] ^= C[ 3]);
|
||||
uint32_t x4 = (B[ 4] ^= C[ 4]),
|
||||
x5 = (B[ 5] ^= C[ 5]),
|
||||
x6 = (B[ 6] ^= C[ 6]),
|
||||
x7 = (B[ 7] ^= C[ 7]);
|
||||
uint32_t x8 = (B[ 8] ^= C[ 8]),
|
||||
x9 = (B[ 9] ^= C[ 9]),
|
||||
xa = (B[10] ^= C[10]),
|
||||
xb = (B[11] ^= C[11]);
|
||||
uint32_t xc = (B[12] ^= C[12]),
|
||||
xd = (B[13] ^= C[13]),
|
||||
xe = (B[14] ^= C[14]),
|
||||
xf = (B[15] ^= C[15]);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
B[ 0] += x0;
|
||||
B[ 1] += x1;
|
||||
B[ 2] += x2;
|
||||
B[ 3] += x3;
|
||||
B[ 4] += x4;
|
||||
B[ 5] += x5;
|
||||
B[ 6] += x6;
|
||||
B[ 7] += x7;
|
||||
B[ 8] += x8;
|
||||
B[ 9] += x9;
|
||||
B[10] += xa;
|
||||
B[11] += xb;
|
||||
B[12] += xc;
|
||||
B[13] += xd;
|
||||
B[14] += xe;
|
||||
B[15] += xf;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param X input/ouput
|
||||
* @param V scratch buffer
|
||||
* @param N factor (def. 1024)
|
||||
*/
|
||||
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
|
||||
{
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
memcpy(&V[i * 32], X, 128);
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
uint32_t j = 32 * (X[16] & (N - 1));
|
||||
for (uint8_t k = 0; k < 32; k++)
|
||||
X[k] ^= V[j + k];
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
}
|
||||
|
1551
algo/scrypt/scrypt.c
1551
algo/scrypt/scrypt.c
File diff suppressed because it is too large
Load Diff
@@ -39,10 +39,10 @@
|
||||
void
|
||||
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
|
||||
{
|
||||
SHA256_CTX ctx;
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, in, len );
|
||||
SHA256_Final( digest, &ctx );
|
||||
sha256_context ctx;
|
||||
sha256_ctx_init( &ctx );
|
||||
sha256_update( &ctx, in, len );
|
||||
sha256_final( &ctx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
|
||||
void
|
||||
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
{
|
||||
unsigned char pad[64];
|
||||
unsigned char pad[64] __attribute__ ((aligned (64)));
|
||||
unsigned char khash[32];
|
||||
const unsigned char * K = _K;
|
||||
size_t i;
|
||||
@@ -72,27 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
if ( Klen > 64 )
|
||||
{
|
||||
SHA256_Init( &ctx->ictx );
|
||||
SHA256_Update( &ctx->ictx, K, Klen );
|
||||
SHA256_Final( khash, &ctx->ictx );
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
sha256_update( &ctx->ictx, K, Klen );
|
||||
sha256_final( &ctx->ictx, khash );
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
}
|
||||
|
||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||
SHA256_Init( &ctx->ictx );
|
||||
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
|
||||
|
||||
memset( pad + Klen, 0x36, 64 - Klen );
|
||||
SHA256_Update( &ctx->ictx, pad, 64 );
|
||||
sha256_update( &ctx->ictx, pad, 64 );
|
||||
|
||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||
SHA256_Init( &ctx->octx );
|
||||
sha256_ctx_init( &ctx->octx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
|
||||
|
||||
memset( pad + Klen, 0x5c, 64 - Klen );
|
||||
SHA256_Update( &ctx->octx, pad, 64 );
|
||||
sha256_update( &ctx->octx, pad, 64 );
|
||||
}
|
||||
|
||||
/* Add bytes to the HMAC-SHA256 operation. */
|
||||
@@ -100,23 +101,17 @@ void
|
||||
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
|
||||
{
|
||||
/* Feed data to the inner SHA256 operation. */
|
||||
SHA256_Update( &ctx->ictx, in, len );
|
||||
sha256_update( &ctx->ictx, in, len );
|
||||
}
|
||||
|
||||
/* Finish an HMAC-SHA256 operation. */
|
||||
void
|
||||
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
|
||||
HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
|
||||
{
|
||||
unsigned char ihash[32];
|
||||
|
||||
/* Finish the inner SHA256 operation. */
|
||||
SHA256_Final( ihash, &ctx->ictx );
|
||||
|
||||
/* Feed the inner hash to the outer SHA256 operation. */
|
||||
SHA256_Update( &ctx->octx, ihash, 32 );
|
||||
|
||||
/* Finish the outer SHA256 operation. */
|
||||
SHA256_Final( digest, &ctx->octx );
|
||||
uint32_t ihash[8] __attribute__ ((aligned (32)));
|
||||
sha256_final( &ctx->ictx, ihash );
|
||||
sha256_update( &ctx->octx, ihash, 32 );
|
||||
sha256_final( &ctx->octx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -129,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
|
||||
{
|
||||
HMAC_SHA256_CTX PShctx, hctx;
|
||||
uint8_t _ALIGN(128) T[32];
|
||||
uint8_t _ALIGN(128) U[32];
|
||||
uint64_t _ALIGN(128) T[4];
|
||||
uint64_t _ALIGN(128) U[4];
|
||||
// uint8_t _ALIGN(128) T[32];
|
||||
// uint8_t _ALIGN(128) U[32];
|
||||
uint32_t ivec;
|
||||
size_t i, clen;
|
||||
uint64_t j;
|
||||
@@ -166,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
// _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
|
||||
// _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );
|
||||
|
||||
// for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
|
||||
for ( k = 0; k < 32; k++ )
|
||||
T[k] ^= U[k];
|
||||
// for ( k = 0; k < 32; k++ )
|
||||
// T[k] ^= U[k];
|
||||
}
|
||||
|
||||
/* Copy as many bytes as necessary into buf. */
|
||||
|
@@ -31,18 +31,18 @@
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <openssl/sha.h>
|
||||
#include "sha256-hash.h"
|
||||
|
||||
typedef struct HMAC_SHA256Context
|
||||
{
|
||||
SHA256_CTX ictx;
|
||||
SHA256_CTX octx;
|
||||
sha256_context ictx;
|
||||
sha256_context octx;
|
||||
} HMAC_SHA256_CTX;
|
||||
|
||||
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
|
||||
size_t len, uint8_t digest[32] );
|
||||
|
||||
|
@@ -1,270 +0,0 @@
|
||||
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* This file contains some functions which implement the external data
|
||||
* handling and padding for Merkle-Damgard hash functions which follow
|
||||
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
|
||||
*
|
||||
* API: this file is meant to be included, not compiled as a stand-alone
|
||||
* file. Some macros must be defined:
|
||||
* RFUN name for the round function
|
||||
* HASH "short name" for the hash function
|
||||
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
|
||||
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
|
||||
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
|
||||
* LE64 defined for little-endian, 64-bit based (no example yet)
|
||||
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
|
||||
* BLEN if defined, length of a message block (in bytes)
|
||||
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
|
||||
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
|
||||
* SVAL if defined, reference to the context state information
|
||||
*
|
||||
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
|
||||
* this is used for instance for Tiger, which works on 64-bit words but
|
||||
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
|
||||
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
|
||||
* set, then only one word (64 bits) will be used to encode the input
|
||||
* message length (in bits), otherwise two words will be used (as in
|
||||
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
|
||||
* not PLW1), four 64-bit words will be used to encode the message length
|
||||
* (in bits). Note that regardless of those settings, only 64-bit message
|
||||
* lengths are supported (in bits): messages longer than 2 Exabytes will be
|
||||
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
|
||||
* 2 millions Terabytes, which is huge).
|
||||
*
|
||||
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
|
||||
* function. This is used for Tiger2, which is identical to Tiger except
|
||||
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
|
||||
* of the 0x01 from original Tiger).
|
||||
*
|
||||
* The RFUN function is invoked with two arguments, the first pointing to
|
||||
* aligned data (as a "const void *"), the second being state information
|
||||
* from the context structure. By default, this state information is the
|
||||
* "val" field from the context, and this field is assumed to be an array
|
||||
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
|
||||
* from the context structure. The "val" field can have any type, except
|
||||
* for the output encoding which assumes that it is an array of "sph_u32"
|
||||
* values. By defining NO_OUTPUT, this last step is deactivated; the
|
||||
* includer code is then responsible for writing out the hash result. When
|
||||
* NO_OUTPUT is defined, the third parameter to the "close()" function is
|
||||
* ignored.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
#undef SPH_BLEN
|
||||
#undef SPH_WLEN
|
||||
#if defined BE64 || defined LE64
|
||||
#define SPH_BLEN 128U
|
||||
#define SPH_WLEN 8U
|
||||
#else
|
||||
#define SPH_BLEN 64U
|
||||
#define SPH_WLEN 4U
|
||||
#endif
|
||||
|
||||
#ifdef BLEN
|
||||
#undef SPH_BLEN
|
||||
#define SPH_BLEN BLEN
|
||||
#endif
|
||||
|
||||
#undef SPH_MAXPAD
|
||||
#if defined PLW1
|
||||
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
|
||||
#elif defined PLW4
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
|
||||
#else
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
|
||||
#endif
|
||||
|
||||
#undef SPH_VAL
|
||||
#undef SPH_NO_OUTPUT
|
||||
#ifdef SVAL
|
||||
#define SPH_VAL SVAL
|
||||
#define SPH_NO_OUTPUT 1
|
||||
#else
|
||||
#define SPH_VAL sc->val
|
||||
#endif
|
||||
|
||||
#ifndef CLOSE_ONLY
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
static void
|
||||
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
|
||||
#else
|
||||
void
|
||||
HASH ( void *cc, const void *data, size_t len )
|
||||
#endif
|
||||
{
|
||||
SPH_XCAT( HASH, _context ) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = SPH_BLEN - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == SPH_BLEN )
|
||||
{
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
ptr = 0;
|
||||
}
|
||||
sc->count += clen;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
void
|
||||
HASH (void *cc, const void *data, size_t len)
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
unsigned ptr;
|
||||
|
||||
if ( len < (2 * SPH_BLEN) )
|
||||
{
|
||||
SPH_XCAT(HASH, _short)(cc, data, len);
|
||||
return;
|
||||
}
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
if ( ptr > 0 )
|
||||
{
|
||||
unsigned t;
|
||||
t = SPH_BLEN - ptr;
|
||||
SPH_XCAT( HASH, _short )( cc, data, t );
|
||||
vdata = vdata + (t>>3);
|
||||
len -= t;
|
||||
}
|
||||
SPH_XCAT( HASH, _short )( cc, data, len );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Perform padding and produce result. The context is NOT reinitialized
|
||||
* by this function.
|
||||
*/
|
||||
static void
|
||||
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
|
||||
void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
unsigned ptr, u;
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
|
||||
#ifdef PW01
|
||||
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
|
||||
#else
|
||||
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
|
||||
#endif
|
||||
ptr += 8;
|
||||
|
||||
if ( ptr > SPH_MAXPAD )
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
|
||||
}
|
||||
#if defined BE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD>>3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#elif defined PLW4
|
||||
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
|
||||
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#else
|
||||
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#endif // PLW
|
||||
#else // LE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
#elif defined PLW4
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( c->count >> 61 );
|
||||
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
|
||||
2 * SPH_WLEN );
|
||||
#else
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( sc->count >> 61 );
|
||||
#endif // PLW
|
||||
|
||||
#endif // LE64
|
||||
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
|
||||
#ifdef SPH_NO_OUTPUT
|
||||
(void)dst;
|
||||
(void)rnum;
|
||||
(void)u;
|
||||
#else
|
||||
for ( u = 0; u < rnum; u ++ )
|
||||
{
|
||||
#if defined BE64
|
||||
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
|
||||
#else // LE64
|
||||
((__m256i*)dst)[u] = sc->val[u];
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
|
||||
}
|
@@ -51,7 +51,6 @@ typedef struct {
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
@@ -59,6 +58,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
@@ -70,13 +79,23 @@ typedef struct {
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -88,13 +107,23 @@ typedef struct {
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
@@ -8,11 +8,10 @@
|
||||
* any later version. See COPYING for more details.
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "sha256d-4way.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
|
||||
#define EXTERN_SHA256
|
||||
@@ -181,6 +180,9 @@ static const uint32_t sha256d_hash1[16] = {
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000100
|
||||
};
|
||||
|
||||
// this performs the entire hash all over again, why?
|
||||
// because main function only does 56 rounds.
|
||||
|
||||
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
{
|
||||
uint32_t S[16];
|
||||
@@ -196,18 +198,29 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
hash[i] = swab32(hash[i]);
|
||||
}
|
||||
|
||||
extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
/*
|
||||
#if defined (__SHA__)
|
||||
SHA256_CTX ctx;
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, data, len );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, hash, 32 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, data, len );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
|
||||
uint32_t S[16], T[16];
|
||||
int i, r;
|
||||
|
||||
@@ -229,9 +242,11 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
sha256_transform(T, S, 0);
|
||||
for (i = 0; i < 8; i++)
|
||||
be32enc((uint32_t *)hash + i, T[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
static inline void sha256d_preextend(uint32_t *W)
|
||||
{
|
||||
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
|
||||
@@ -479,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
|
||||
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_4way( struct work *work,
|
||||
static inline int scanhash_sha256d_4way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -540,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
|
||||
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_8way( struct work *work,
|
||||
static inline int scanhash_sha256d_8way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -596,8 +611,8 @@ static inline int scanhash_sha256d_8way( struct work *work,
|
||||
|
||||
#endif /* HAVE_SHA256_8WAY */
|
||||
|
||||
int scanhash_sha256d( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -612,11 +627,11 @@ int scanhash_sha256d( struct work *work,
|
||||
|
||||
#ifdef HAVE_SHA256_8WAY
|
||||
if ( sha256_use_8way() )
|
||||
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
|
||||
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
if ( sha256_use_4way() )
|
||||
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
|
||||
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
|
||||
memcpy(data, pdata + 16, 64);
|
||||
@@ -643,6 +658,7 @@ int scanhash_sha256d( struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -672,18 +688,20 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(__SHA__)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_SHA256d;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
//#elif defined(SHA256D_8WAY)
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d;
|
||||
gate->scanhash = (void*)&scanhash_sha256d_pooler;
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
gate->hash = (void*)&sha256d;
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user