mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
10 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
71d6b97ee8 | ||
![]() |
b2331375a3 | ||
![]() |
7fec680835 | ||
![]() |
1b0a5aadf6 | ||
![]() |
0a3c52810e | ||
![]() |
4d4386a374 | ||
![]() |
ce259b915a | ||
![]() |
02202ab803 | ||
![]() |
77c5ae80ab | ||
![]() |
eb3f57bfc7 |
126
INSTALL_LINUX
Normal file
126
INSTALL_LINUX
Normal file
@@ -0,0 +1,126 @@
|
||||
|
||||
|
||||
Requirements:
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux operating system. Apple is not supported.
|
||||
|
||||
Building on linux prerequisites:
|
||||
|
||||
It is assumed users know how to install packages on their system and
|
||||
be able to compile standard source packages. This is basic Linux and
|
||||
beyond the scope of cpuminer-opt. Regardless compiling is trivial if you
|
||||
follow the instructions.
|
||||
|
||||
Make sure you have the basic development packages installed.
|
||||
Here is a good start:
|
||||
|
||||
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
||||
|
||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||
are some of the ones that may not be in the default install and need to
|
||||
be installed manually. There may be others, read the error messages they
|
||||
will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu:
|
||||
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
|
||||
|
||||
build-essential (Development Tools package group on Fedora)
|
||||
automake
|
||||
libjansson-dev
|
||||
libgmp-dev
|
||||
libcurl4-openssl-dev
|
||||
libssl-dev
|
||||
lib-thread
|
||||
zlib1g-dev
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following, depending on the
|
||||
compiler version, to CFLAGS:
|
||||
"-march=native" or "-march=znver1" or "-msha".
|
||||
|
||||
Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
|
||||
to override multiway AVX2 on algos with sha256, and use SHA instead.
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
Static builds should only considered in a homogeneous HW and SW environment.
|
||||
Local builds will always have the best performance and compatibility.
|
||||
|
||||
Extract cpuminer source.
|
||||
|
||||
tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
cd cpuminer-opt-x.y.z
|
||||
|
||||
Run ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Start mining.
|
||||
|
||||
./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
Windows
|
||||
|
||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||
with a more recent compiler than the following Windows hosted procedure.
|
||||
|
||||
Building on Windows prerequisites:
|
||||
|
||||
msys
|
||||
mingw_w64
|
||||
Visual C++ redistributable 2008 X64
|
||||
openssl
|
||||
|
||||
Install msys and mingw_w64, only needed once.
|
||||
|
||||
Unpack msys into C:\msys or your preferred directory.
|
||||
|
||||
Install mingw_w64 from win-builds.
|
||||
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
|
||||
existing msys instalation.
|
||||
|
||||
Open a msys shell by double clicking on msys.bat.
|
||||
Note that msys shell uses linux syntax for file specifications, "C:\" is
|
||||
mounted at "/c/".
|
||||
|
||||
Add mingw bin directory to PATH variable
|
||||
PATH="/c/msys/opt/windows_64/bin/:$PATH"
|
||||
|
||||
Instalation complete, compile cpuminer-opt.
|
||||
|
||||
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
|
||||
or similar Windows program.
|
||||
|
||||
In msys shell cd to miner directory.
|
||||
cd /c/path/to/cpuminer-opt
|
||||
|
||||
Run build.sh to build on Windows or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Start mining
|
||||
|
||||
cpuminer.exe -a algo -o url -u user -p password
|
||||
|
||||
The following tips may be useful for older AMD CPUs.
|
||||
|
||||
AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
|
||||
not supported by cpuminer-opt due to an incompatible implementation of SSE2
|
||||
on these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Some users with AMD CPUs without AES_NI have reported problems compiling
|
||||
with build.sh or "-march=native". Problems have included compile errors
|
||||
and poor performance. These users are recommended to compile manually
|
||||
specifying "-march=btver1" on the configure command line.
|
||||
|
||||
Support for even older x86_64 without AES_NI or SSE2 is not availble.
|
||||
|
173
INSTALL_WINDOWS
Normal file
173
INSTALL_WINDOWS
Normal file
@@ -0,0 +1,173 @@
|
||||
Instructions for compiling cpuminer-opt for Windows.
|
||||
|
||||
|
||||
Windows compilation using Visual Studio is not supported. Mingw64 is
|
||||
used on a Linux system (bare metal or virtual machine) to cross-compile
|
||||
cpuminer-opt executable binaries for Windows.
|
||||
|
||||
These instructions were written for Debian and Ubuntu compatible distributions
|
||||
but should work on other major distributions as well. However some of the
|
||||
package names or file paths may be different.
|
||||
|
||||
It is assumed a Linux system is already available and running. And the user
|
||||
has enough Linux knowledge to find and install packages and follow these
|
||||
instructions.
|
||||
|
||||
First it is a good idea to create new user specifically for cross compiling.
|
||||
It keeps all mingw stuff contained and isolated from the rest of the system.
|
||||
|
||||
Step by step...
|
||||
|
||||
1. Install necessary packages from the distribution's repositories.
|
||||
|
||||
Refer to Linux compile instructions and install required packages.
|
||||
|
||||
Additionally, install mingw-64.
|
||||
|
||||
sudo apt-get install mingw-w64
|
||||
|
||||
|
||||
2. Create a local library directory for packages to be compiled in the next
|
||||
step. Recommended location is $HOME/usr/lib/
|
||||
|
||||
|
||||
3. Download and build other packages for mingw that don't have a mingw64
|
||||
version available in the repositories.
|
||||
|
||||
Download the following source code packages from their respective and
|
||||
respected download locations, copy them to ~/usr/lib/ and uncompress them.
|
||||
|
||||
openssl
|
||||
curl
|
||||
gmp
|
||||
|
||||
In most cases the latest vesrion is ok but it's safest to download
|
||||
the same major and minor version as included in your distribution.
|
||||
|
||||
Run the following commands or follow the supplied instructions.
|
||||
Do not run "make install" unless you are using ~/usr/lib, which isn't
|
||||
recommended.
|
||||
|
||||
Some instructions insist on running "make check". If make check fails
|
||||
it may still work, YMMV.
|
||||
|
||||
You can speed up "make" by using all CPU cores available with "-j n" where
|
||||
n is the number of CPU threads you want to use.
|
||||
|
||||
openssl:
|
||||
|
||||
./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
curl:
|
||||
|
||||
./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
gmp:
|
||||
|
||||
./configure --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
|
||||
|
||||
4. Tweak the environment.
|
||||
|
||||
This step is required everytime you login or the commands can be added to
|
||||
.bashrc.
|
||||
|
||||
Define some local variables to point to local library.
|
||||
|
||||
export LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
Create a release directory and copy some dll files previously built.
|
||||
This can be done outside of cpuminer-opt and only needs to be done once.
|
||||
If the release directory is in cpuminer-opt directory it needs to be
|
||||
recreated every a source package is decompressed.
|
||||
|
||||
mkdir release
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
|
||||
|
||||
The following steps need to be done every time a new source package is
|
||||
opened.
|
||||
|
||||
5. Download cpuminer-opt
|
||||
|
||||
Download the latest source code package of cpumuner-opt to your desired
|
||||
location. .zip or .tar.gz, your choice.
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/releases
|
||||
|
||||
Decompress and change to the cpuminer-opt directory.
|
||||
|
||||
|
||||
|
||||
6. Prepare to compile
|
||||
|
||||
Create a link to the locally compiled version of gmp.h
|
||||
|
||||
ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
|
||||
|
||||
Edit configure.ac to fix lipthread package name.
|
||||
|
||||
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
|
||||
7. Compile
|
||||
|
||||
you can use the default compile if you intend to use cpuminer-opt on the
|
||||
same CPU and the virtual machine supports that architecture.
|
||||
|
||||
./build.sh
|
||||
|
||||
Otherwise you can compile manually while setting options in CFLAGS.
|
||||
|
||||
Some common options:
|
||||
|
||||
To compile for a specific CPU architecture:
|
||||
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for AMD Ryzen.
|
||||
|
||||
You can compile more generically for a set of specific CPU features
|
||||
if you know what features you want:
|
||||
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for an older CPU that does not have AVX.
|
||||
|
||||
You can find several examples in build-allarch.sh
|
||||
|
||||
If you have a CPU with more than 64 threads and Windows 7 or higher you
|
||||
can enable the CPU Groups feature:
|
||||
|
||||
-D_WIN32_WINNT==0x0601
|
||||
|
||||
Once you have run configure successfully run make with n CPU threads:
|
||||
|
||||
make -j n
|
||||
|
||||
Copy cpuminer.exe to the release directory, compress and copy the release
|
||||
directory to a Windows system and run cpuminer.exe from the command line.
|
||||
|
||||
Run cpuminer
|
||||
|
||||
In a command windows change directories to the unzipped release folder.
|
||||
to get a list of all options:
|
||||
|
||||
cpuminer.exe --help
|
||||
|
||||
Command options are specific to where you mine. Refer to the pool's
|
||||
instructions on how to set them.
|
26
Makefile.am
26
Makefile.am
@@ -42,10 +42,11 @@ cpuminer_SOURCES = \
|
||||
algo/argon2/argon2d/argon2d/argon2.c \
|
||||
algo/argon2/argon2d/argon2d/core.c \
|
||||
algo/argon2/argon2d/argon2d/opt.c \
|
||||
algo/argon2/argon2d/argon2d/thread.c \
|
||||
algo/argon2/argon2d/argon2d/argon2d_thread.c \
|
||||
algo/argon2/argon2d/argon2d/encoding.c \
|
||||
algo/blake/sph_blake.c \
|
||||
algo/blake/blake-hash-4way.c \
|
||||
algo/blake/blake256-hash-4way.c \
|
||||
algo/blake/blake512-hash-4way.c \
|
||||
algo/blake/blake-gate.c \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
@@ -67,7 +68,8 @@ cpuminer_SOURCES = \
|
||||
algo/blake/pentablake-4way.c \
|
||||
algo/blake/pentablake.c \
|
||||
algo/bmw/sph_bmw.c \
|
||||
algo/bmw/bmw-hash-4way.c \
|
||||
algo/bmw/bmw256-hash-4way.c \
|
||||
algo/bmw/bmw512-hash-4way.c \
|
||||
algo/bmw/bmw256.c \
|
||||
algo/cryptonight/cryptolight.c \
|
||||
algo/cryptonight/cryptonight-common.c\
|
||||
@@ -136,6 +138,8 @@ cpuminer_SOURCES = \
|
||||
algo/nist5/nist5-4way.c \
|
||||
algo/nist5/nist5.c \
|
||||
algo/nist5/zr5.c \
|
||||
algo/panama/sph_panama.c \
|
||||
algo/radiogatun/sph_radiogatun.c \
|
||||
algo/pluck.c \
|
||||
algo/quark/quark-gate.c \
|
||||
algo/quark/quark.c \
|
||||
@@ -159,14 +163,18 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha2-hash-4way.c \
|
||||
algo/sha/sha256_hash_11way.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/shabal/sph_shabal.c \
|
||||
algo/shabal/shabal-hash-4way.c \
|
||||
algo/shavite/sph_shavite.c \
|
||||
algo/shavite/sph-shavite-aesni.c \
|
||||
algo/shavite/shavite-hash-2way.c \
|
||||
algo/shavite/shavite.c \
|
||||
algo/simd/sph_simd.c \
|
||||
algo/simd/nist.c \
|
||||
@@ -240,21 +248,25 @@ cpuminer_SOURCES = \
|
||||
algo/x15/x15-gate.c \
|
||||
algo/x15/x15.c \
|
||||
algo/x15/x15-4way.c \
|
||||
algo/x16/x16r-gate.c \
|
||||
algo/x16/x16r.c \
|
||||
algo/x16/x16r-4way.c \
|
||||
algo/x17/x17-gate.c \
|
||||
algo/x17/x17.c \
|
||||
algo/x17/x17-4way.c \
|
||||
algo/x17/xevan-gate.c \
|
||||
algo/x17/xevan.c \
|
||||
algo/x17/xevan-4way.c \
|
||||
algo/x17/x16r-gate.c \
|
||||
algo/x17/x16r.c \
|
||||
algo/x17/x16r-4way.c \
|
||||
algo/x17/hmq1725.c \
|
||||
algo/x17/sonoa-gate.c \
|
||||
algo/x17/sonoa-4way.c \
|
||||
algo/x17/sonoa.c \
|
||||
algo/x20/x20r.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/sha256_Y.c \
|
||||
algo/yescrypt/yescrypt-best.c \
|
||||
algo/yespower/yespower.c \
|
||||
algo/yespower/sha256.c \
|
||||
algo/yespower/sha256_p.c \
|
||||
algo/yespower/yespower-opt.c
|
||||
|
||||
disable_flags =
|
||||
|
@@ -16,7 +16,8 @@ https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
mailto://jayddee246@gmail.com
|
||||
|
||||
See file RELEASE_NOTES for change log and compile instructions.
|
||||
See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
|
||||
for compile instructions.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
@@ -78,6 +79,7 @@ Supported Algorithms
|
||||
lyra2h Hppcoin
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2, Vertcoin
|
||||
lyra2rev3 lyrav2v3, Vertcoin
|
||||
lyra2z Zcoin (XZC)
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
@@ -100,6 +102,7 @@ Supported Algorithms
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
sonoa Sono
|
||||
timetravel Machinecoin (MAC)
|
||||
timetravel10 Bitcore
|
||||
tribus Denarius (DNR)
|
||||
@@ -130,6 +133,8 @@ Supported Algorithms
|
||||
Errata
|
||||
------
|
||||
|
||||
Cryptonight and variants are no longer supported, use another miner.
|
||||
|
||||
Neoscrypt crashes on Windows, use legacy version.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
|
15
README.txt
15
README.txt
@@ -12,32 +12,27 @@ the software, don't use it.
|
||||
Choose the exe that best matches you CPU's features or use trial and
|
||||
error to find the fastest one that doesn't crash. Pay attention to
|
||||
the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using all the available features.
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Pentium and Celeron often have fewer features.
|
||||
Core series. Even the newest Pentium and Celeron CPUs are often missing
|
||||
features.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
|
||||
|
||||
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere, Sandy-Ivybridge
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
|
||||
cpuminer-zen "-march=znver1 -DRYZEN_" Ryzen
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
|
||||
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
|
||||
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
|
||||
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
|
||||
|
||||
|
||||
|
232
RELEASE_NOTES
232
RELEASE_NOTES
@@ -1,11 +1,11 @@
|
||||
puminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
|
||||
cpuminer-opt is a console program run from the command line using the
|
||||
keyboard, not the mouse.
|
||||
|
||||
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
|
||||
This feature requires recent SW including GCC version 5 or higher and
|
||||
openssl version 1.1 or higher. It may also require using "-march=znver1"
|
||||
compile flag.
|
||||
|
||||
HW SHA support is only available when compiled from source, Windows binaries
|
||||
are not yet available.
|
||||
|
||||
cpuminer-opt is a console program, if you're using a mouse you're doing it
|
||||
wrong.
|
||||
|
||||
@@ -25,140 +25,110 @@ required.
|
||||
Compile Instructions
|
||||
--------------------
|
||||
|
||||
Requirements:
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux or Windows operating system. Apple is not supported.
|
||||
|
||||
Building on linux prerequisites:
|
||||
|
||||
It is assumed users know how to install packages on their system and
|
||||
be able to compile standard source packages. This is basic Linux and
|
||||
beyond the scope of cpuminer-opt.
|
||||
|
||||
Make sure you have the basic development packages installed.
|
||||
Here is a good start:
|
||||
|
||||
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
||||
|
||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||
are some of the ones that may not be in the default install and need to
|
||||
be installed manually. There may be others, read the error messages they
|
||||
will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu:
|
||||
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
|
||||
|
||||
build-essential (for Ubuntu, Development Tools package group on Fedora)
|
||||
automake
|
||||
libjansson-dev
|
||||
libgmp-dev
|
||||
libcurl4-openssl-dev
|
||||
libssl-dev
|
||||
pthreads
|
||||
zlib
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
|
||||
or higher. Reports of improved performiance on Ryzen when using openssl 1.0.2
|
||||
have been due to AVX and AVX2 optimizations added to that version.
|
||||
Additional improvements are expected on Ryzen with openssl 1.1.
|
||||
"-march-znver1" or "-msha".
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
Static builds should only considered in a homogeneous HW and SW environment.
|
||||
Local builds will always have the best performance and compatibility.
|
||||
|
||||
Extract cpuminer source.
|
||||
|
||||
tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
cd cpuminer-opt-x.y.z
|
||||
|
||||
Run ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Additional optional compile flags, add the following to CFLAGS to activate:
|
||||
|
||||
-DUSE_SPH_SHA (deprecated)
|
||||
|
||||
SPH may give slightly better performance on algos that use sha256 when using
|
||||
openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
|
||||
better than SPH. This option is ignored when 4-way is used, even for CPUs
|
||||
with SHA.
|
||||
|
||||
Start mining.
|
||||
|
||||
./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
Windows
|
||||
|
||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||
with a more recent compiler than the following Windows hosted procedure.
|
||||
|
||||
Building on Windows prerequisites:
|
||||
|
||||
msys
|
||||
mingw_w64
|
||||
Visual C++ redistributable 2008 X64
|
||||
openssl
|
||||
|
||||
Install msys and mingw_w64, only needed once.
|
||||
|
||||
Unpack msys into C:\msys or your preferred directory.
|
||||
|
||||
Install mingw_w64 from win-builds.
|
||||
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
|
||||
existing msys instalation.
|
||||
|
||||
Open a msys shell by double clicking on msys.bat.
|
||||
Note that msys shell uses linux syntax for file specifications, "C:\" is
|
||||
mounted at "/c/".
|
||||
|
||||
Add mingw bin directory to PATH variable
|
||||
PATH="/c/msys/opt/windows_64/bin/:$PATH"
|
||||
|
||||
Instalation complete, compile cpuminer-opt.
|
||||
|
||||
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
|
||||
or similar Windows program.
|
||||
|
||||
In msys shell cd to miner directory.
|
||||
cd /c/path/to/cpuminer-opt
|
||||
|
||||
Run build.sh to build on Windows or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Start mining
|
||||
|
||||
cpuminer.exe -a algo -o url -u user -p password
|
||||
|
||||
The following tips may be useful for older AMD CPUs.
|
||||
|
||||
AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
|
||||
not supported by cpuminer-opt due to an incompatible implementation of SSE2
|
||||
on these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Some users with AMD CPUs without AES_NI have reported problems compiling
|
||||
with build.sh or "-march=native". Problems have included compile errors
|
||||
and poor performance. These users are recommended to compile manually
|
||||
specifying "-march=btver1" on the configure command line.
|
||||
|
||||
Support for even older x86_64 without AES_NI or SSE2 is not availble.
|
||||
|
||||
64 bit Linux or Windows operating system. Apple and Android are not supported.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.9.3.1
|
||||
|
||||
Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
|
||||
Fixed x16r algo 25% invalid share reject rate. The bug may have also
|
||||
affected other algos.
|
||||
|
||||
v3.9.2.5
|
||||
|
||||
Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
|
||||
More restructuring.
|
||||
|
||||
v3.9.2.4
|
||||
|
||||
Yet another affinity fix. Hopefully the last one.
|
||||
|
||||
v3.9.2.3
|
||||
|
||||
Another cpu-affinity fix.
|
||||
Disabled test code that fails to compile on some CPUs with limited
|
||||
AVX512 capabilities.
|
||||
|
||||
v3.9.2.2
|
||||
|
||||
Fixed some day one cpu-affinity issues.
|
||||
|
||||
v3.9.2
|
||||
|
||||
Added sha256q algo.
|
||||
Yespower now uses openssl SHA256, but no observable hash rate increase
|
||||
on Ryzen.
|
||||
Ongoing rearchitecting.
|
||||
Lyra2z now hashes 8-way on CPUs with AVX2.
|
||||
Lyra2 (all including phi2) now runs optimized code with SSE2.
|
||||
|
||||
v3.9.1.1
|
||||
|
||||
Fixed lyra2v3 AVX and below.
|
||||
|
||||
Compiling on Windows using Cygwin now works. Simply use "./build.sh"
|
||||
just like on Linux. It isn't portable therefore the binaries package will
|
||||
continue to use the existing procedure.
|
||||
The Cygwin procedure will be documented in more detail later and will
|
||||
include a list of packages that need to be installed.
|
||||
|
||||
v3.9.1
|
||||
|
||||
Fixed AVX2 version of anime algo.
|
||||
|
||||
Added sonoa algo.
|
||||
|
||||
Added "-DRYZEN_" compile option for Ryzen to override 4-way hashing when algo
|
||||
contains sha256 and use SHA instead. This is due to a combination of
|
||||
the introduction of HW SHA support combined with the poor performance
|
||||
of AVX2 on Ryzen. The Windows binaries package replaces cpuminer-avx2-sha
|
||||
with cpuminer-zen compiled with the override. Refer to the build instructions
|
||||
for more information.
|
||||
|
||||
Ongoing restructuring to streamline the process, reduce latency,
|
||||
reduce memory usage and unnecessary copying of data. Most of these
|
||||
will not result in a notoceably higher reported hashrate as the
|
||||
change simply reduces the time wasted that wasn't factored into the
|
||||
hash rate reported by the miner. In short, less dead time resulting in
|
||||
a higher net hashrate.
|
||||
|
||||
One of these measures to reduce latency also results in an enhanced
|
||||
share submission message including the share number*, the CPU thread,
|
||||
and the vector lane that found the solution. The time difference between
|
||||
the share submission and acceptance (or rejection) response indicates
|
||||
network ltatency. One other effect of this change is a reduction in hash
|
||||
meter messages because the scan function no longer exits when a share is
|
||||
found. Scan cycles will go longer and submit multiple shares per cycle.
|
||||
*the share number is antcipated and includes both accepted and rejected
|
||||
shares. Because the share is antipated and not synchronized it may be
|
||||
incorrect in time of very rapid share submission. Under most conditions
|
||||
it should be easy to match the submission with the corresponding response.
|
||||
|
||||
Removed "-DUSE_SPH_SHA" option, all users should have a recent version of
|
||||
openssl installed: v1.0.2 (Ubuntu 16.04) or better. Ryzen SHA requires
|
||||
v1.1.0 or better. Ryzen SHA is not used when hashing multi-way parallel.
|
||||
Ryzen SHA is available in the Windows binaries release package.
|
||||
|
||||
Improved compile instructions, now in seperate files: INSTALL_LINUX and
|
||||
INSTALL_WINDOWS. The Windows instructions are used to build the binaries
|
||||
release package. It's built on a Linux system either running as a virtual
|
||||
machine or a seperate computer. At this time there is no known way to
|
||||
build natively on a Windows system.
|
||||
|
||||
v3.9.0.1
|
||||
|
||||
Isolate Windows CPU groups code when CPU groups support not explicitly defined.
|
||||
|
||||
v3.9.0
|
||||
|
||||
Added support for Windows CPU groups.
|
||||
@@ -167,6 +137,7 @@ Prep work for AVX512.
|
||||
Added lyra2rev3 for the vertcoin algo change.
|
||||
Added yespower, yespowerr16 (Yenten)
|
||||
Added phi2 algo for LUX
|
||||
Discontinued support for cryptonight and variants.
|
||||
|
||||
v3.8.8.1
|
||||
|
||||
@@ -350,6 +321,7 @@ Changed default sha256 and sha512 to openssl. This should be used when
|
||||
compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
|
||||
This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
|
||||
myr-gr & others when openssl 1.0.2 is installed.
|
||||
Note: -DUSE_SPH_SHA has been removed in v3.9.1.
|
||||
Users with openssl 1.0.1 (Ubuntu 14.04) may get better perforance by adding
|
||||
"-DUSE_SPH_SHA" to CLAGS.
|
||||
Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
|
||||
|
@@ -210,10 +210,12 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
@@ -266,6 +268,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
// override std defaults with jr2 defaults
|
||||
bool register_json_rpc2( algo_gate_t *gate )
|
||||
{
|
||||
applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
|
||||
gate->wait_for_diff = (void*)&do_nothing;
|
||||
gate->get_new_work = (void*)&jr2_get_new_work;
|
||||
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
|
||||
@@ -339,9 +345,9 @@ const char* const algo_alias_map[][2] =
|
||||
{ NULL, NULL }
|
||||
};
|
||||
|
||||
// if arg is a valid alias for a known algo it is updated with the proper name.
|
||||
// No validation of the algo or alias is done, It is the responsinility of the
|
||||
// calling function to validate the algo after return.
|
||||
// if arg is a valid alias for a known algo it is updated with the proper
|
||||
// name. No validation of the algo or alias is done, It is the responsinility
|
||||
// of the calling function to validate the algo after return.
|
||||
void get_algo_alias( char** algo_or_alias )
|
||||
{
|
||||
int i;
|
||||
@@ -354,3 +360,24 @@ void get_algo_alias( char** algo_or_alias )
|
||||
}
|
||||
}
|
||||
|
||||
#undef ALIAS
|
||||
#undef PROPER
|
||||
|
||||
// only for parallel when there are lanes.
|
||||
bool submit_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr, int lane )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
if ( submit_work( thr, work ) )
|
||||
{
|
||||
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
thr->id, lane );
|
||||
return true;
|
||||
}
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
@@ -2,8 +2,7 @@
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "interleave.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////////
|
||||
////
|
||||
@@ -109,8 +108,15 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// special case, only one target, provides a callback for scanhash to
|
||||
// submit work with less overhead.
|
||||
// bool (*submit_work ) ( struct thr_info*, const struct work* );
|
||||
|
||||
// mandatory functions, must be overwritten
|
||||
int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
|
||||
// Added a 5th arg for the thread_info structure to replace the int thr id
|
||||
// in the first arg. Both will co-exist during the trasition.
|
||||
//int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
|
||||
int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
// optional unsafe, must be overwritten if algo uses function
|
||||
void ( *hash ) ( void*, const void*, uint32_t ) ;
|
||||
@@ -188,6 +194,12 @@ void four_way_not_tested();
|
||||
// allways returns failure
|
||||
int null_scanhash();
|
||||
|
||||
// The one and only, a callback for scanhash.
|
||||
bool submit_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr, int lane );
|
||||
|
||||
bool submit_work( struct thr_info *thr, const struct work *work_in );
|
||||
|
||||
// displays warning
|
||||
void null_hash ();
|
||||
void null_hash_suw();
|
||||
|
@@ -17,7 +17,7 @@
|
||||
|
||||
#if !defined(ARGON2_NO_THREADS)
|
||||
|
||||
#include "thread.h"
|
||||
#include "argon2d_thread.h"
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#endif
|
@@ -30,7 +30,7 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "core.h"
|
||||
#include "thread.h"
|
||||
#include "argon2d_thread.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blake2-impl.h"
|
||||
|
||||
|
@@ -37,7 +37,7 @@
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
//#ifdef __SSE4_2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -45,7 +45,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
@@ -57,19 +57,22 @@ extern "C"{
|
||||
// Blake-256 4 way
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i H[8];
|
||||
__m128i S[4];
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context;
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *cc);
|
||||
void blake256_4way(void *cc, const void *data, size_t len);
|
||||
void blake256_4way_close(void *cc, void *dst);
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
@@ -132,12 +135,10 @@ void blake512_4way_close(void *cc, void *dst);
|
||||
void blake512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BLAKE_HASH_4WAY_H__
|
||||
|
@@ -30,9 +30,10 @@
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined (__SSE4_2__)
|
||||
//#if defined (__SSE4_2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
@@ -60,26 +61,12 @@ extern "C"{
|
||||
|
||||
// Blake-256
|
||||
|
||||
static const sph_u32 IV256[8] = {
|
||||
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
|
||||
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
|
||||
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
|
||||
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
|
||||
static const uint32_t IV256[8] =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Blake-512
|
||||
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
|
||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
||||
@@ -317,47 +304,6 @@ static const sph_u32 CS[16] = {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
@@ -411,125 +357,41 @@ do { \
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Blake-256 8 way
|
||||
|
||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S_8WAY(r) do { \
|
||||
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
//current_impl
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m128i S0, S1, S2, S3; \
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
H0 = casti_m128i( state->H, 0 ); \
|
||||
H1 = casti_m128i( state->H, 1 ); \
|
||||
H2 = casti_m128i( state->H, 2 ); \
|
||||
H3 = casti_m128i( state->H, 3 ); \
|
||||
H4 = casti_m128i( state->H, 4 ); \
|
||||
H5 = casti_m128i( state->H, 5 ); \
|
||||
H6 = casti_m128i( state->H, 6 ); \
|
||||
H7 = casti_m128i( state->H, 7 ); \
|
||||
S0 = casti_m128i( state->S, 0 ); \
|
||||
S1 = casti_m128i( state->S, 1 ); \
|
||||
S2 = casti_m128i( state->S, 2 ); \
|
||||
S3 = casti_m128i( state->S, 3 ); \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_4WAY(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
casti_m128i( state->H, 0 ) = H0; \
|
||||
casti_m128i( state->H, 1 ) = H1; \
|
||||
casti_m128i( state->H, 2 ) = H2; \
|
||||
casti_m128i( state->H, 3 ) = H3; \
|
||||
casti_m128i( state->H, 4 ) = H4; \
|
||||
casti_m128i( state->H, 5 ) = H5; \
|
||||
casti_m128i( state->H, 6 ) = H6; \
|
||||
casti_m128i( state->H, 7 ) = H7; \
|
||||
casti_m128i( state->S, 0 ) = S0; \
|
||||
casti_m128i( state->S, 1 ) = S1; \
|
||||
casti_m128i( state->S, 2 ) = S2; \
|
||||
casti_m128i( state->S, 3 ) = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -616,30 +478,30 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
|
||||
M0 = mm128_bswap_32( * buf ); \
|
||||
M1 = mm128_bswap_32( *(buf+1) ); \
|
||||
M2 = mm128_bswap_32( *(buf+2) ); \
|
||||
M3 = mm128_bswap_32( *(buf+3) ); \
|
||||
M4 = mm128_bswap_32( *(buf+4) ); \
|
||||
M5 = mm128_bswap_32( *(buf+5) ); \
|
||||
M6 = mm128_bswap_32( *(buf+6) ); \
|
||||
M7 = mm128_bswap_32( *(buf+7) ); \
|
||||
M8 = mm128_bswap_32( *(buf+8) ); \
|
||||
M9 = mm128_bswap_32( *(buf+9) ); \
|
||||
MA = mm128_bswap_32( *(buf+10) ); \
|
||||
MB = mm128_bswap_32( *(buf+11) ); \
|
||||
MC = mm128_bswap_32( *(buf+12) ); \
|
||||
MD = mm128_bswap_32( *(buf+13) ); \
|
||||
ME = mm128_bswap_32( *(buf+14) ); \
|
||||
MF = mm128_bswap_32( *(buf+15) ); \
|
||||
M0 = mm128_bswap_32( buf[ 0] ); \
|
||||
M1 = mm128_bswap_32( buf[ 1] ); \
|
||||
M2 = mm128_bswap_32( buf[ 2] ); \
|
||||
M3 = mm128_bswap_32( buf[ 3] ); \
|
||||
M4 = mm128_bswap_32( buf[ 4] ); \
|
||||
M5 = mm128_bswap_32( buf[ 5] ); \
|
||||
M6 = mm128_bswap_32( buf[ 6] ); \
|
||||
M7 = mm128_bswap_32( buf[ 7] ); \
|
||||
M8 = mm128_bswap_32( buf[ 8] ); \
|
||||
M9 = mm128_bswap_32( buf[ 9] ); \
|
||||
MA = mm128_bswap_32( buf[10] ); \
|
||||
MB = mm128_bswap_32( buf[11] ); \
|
||||
MC = mm128_bswap_32( buf[12] ); \
|
||||
MD = mm128_bswap_32( buf[13] ); \
|
||||
ME = mm128_bswap_32( buf[14] ); \
|
||||
MF = mm128_bswap_32( buf[15] ); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
@@ -673,6 +535,31 @@ do { \
|
||||
|
||||
// Blake-256 8 way
|
||||
|
||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S_8WAY(r) do { \
|
||||
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
@@ -787,312 +674,136 @@ do { \
|
||||
S3 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define READ_STATE64_4WAY(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64_4WAY(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
|
||||
// not used
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M[16]; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_bswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_bswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_bswap_64( *(buf+2) ); \
|
||||
M[0x3] = mm256_bswap_64( *(buf+3) ); \
|
||||
M[0x4] = mm256_bswap_64( *(buf+4) ); \
|
||||
M[0x5] = mm256_bswap_64( *(buf+5) ); \
|
||||
M[0x6] = mm256_bswap_64( *(buf+6) ); \
|
||||
M[0x7] = mm256_bswap_64( *(buf+7) ); \
|
||||
M[0x8] = mm256_bswap_64( *(buf+8) ); \
|
||||
M[0x9] = mm256_bswap_64( *(buf+9) ); \
|
||||
M[0xA] = mm256_bswap_64( *(buf+10) ); \
|
||||
M[0xB] = mm256_bswap_64( *(buf+11) ); \
|
||||
M[0xC] = mm256_bswap_64( *(buf+12) ); \
|
||||
M[0xD] = mm256_bswap_64( *(buf+13) ); \
|
||||
M[0xE] = mm256_bswap_64( *(buf+14) ); \
|
||||
M[0xF] = mm256_bswap_64( *(buf+15) ); \
|
||||
for (r = 0; r < 16; r ++) \
|
||||
ROUND_B_4WAY(r); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
//current impl
|
||||
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M0 = mm256_bswap_64( *(buf + 0) ); \
|
||||
M1 = mm256_bswap_64( *(buf + 1) ); \
|
||||
M2 = mm256_bswap_64( *(buf + 2) ); \
|
||||
M3 = mm256_bswap_64( *(buf + 3) ); \
|
||||
M4 = mm256_bswap_64( *(buf + 4) ); \
|
||||
M5 = mm256_bswap_64( *(buf + 5) ); \
|
||||
M6 = mm256_bswap_64( *(buf + 6) ); \
|
||||
M7 = mm256_bswap_64( *(buf + 7) ); \
|
||||
M8 = mm256_bswap_64( *(buf + 8) ); \
|
||||
M9 = mm256_bswap_64( *(buf + 9) ); \
|
||||
MA = mm256_bswap_64( *(buf + 10) ); \
|
||||
MB = mm256_bswap_64( *(buf + 11) ); \
|
||||
MC = mm256_bswap_64( *(buf + 12) ); \
|
||||
MD = mm256_bswap_64( *(buf + 13) ); \
|
||||
ME = mm256_bswap_64( *(buf + 14) ); \
|
||||
MF = mm256_bswap_64( *(buf + 15) ); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
ROUND_B_4WAY(6); \
|
||||
ROUND_B_4WAY(7); \
|
||||
ROUND_B_4WAY(8); \
|
||||
ROUND_B_4WAY(9); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };
|
||||
static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm_set1_epi32( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm_set1_epi32( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
|
||||
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
|
||||
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
|
||||
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
|
||||
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
|
||||
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
|
||||
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
|
||||
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 1 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 2 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 3 ) = m128_zero;
|
||||
/*
|
||||
sc->S[0] = _mm_set1_epi32( salt[0] );
|
||||
sc->S[1] = _mm_set1_epi32( salt[1] );
|
||||
sc->S[2] = _mm_set1_epi32( salt[2] );
|
||||
sc->S[3] = _mm_set1_epi32( salt[3] );
|
||||
*/
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
|
||||
{
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
__m128i *buf;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // number of elements, sizeof/4
|
||||
__m128i *buf = (__m128i*)ctx->buf;
|
||||
size_t bptr = ctx->ptr<<2;
|
||||
size_t vptr = ctx->ptr >> 2;
|
||||
size_t blen = len << 2;
|
||||
DECL_STATE32_4WAY
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < buf_size - ptr )
|
||||
|
||||
if ( blen < (sizeof ctx->buf) - bptr )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
memcpy( buf + vptr, data, (sizeof ctx->buf) - bptr );
|
||||
bptr += blen;
|
||||
ctx->ptr = bptr>>2;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32_4WAY(sc);
|
||||
while ( len > 0 )
|
||||
READ_STATE32_4WAY( ctx );
|
||||
while ( blen > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
size_t clen = ( sizeof ctx->buf ) - bptr;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>2);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
if ( clen > blen )
|
||||
clen = blen;
|
||||
memcpy( buf + vptr, data, clen );
|
||||
bptr += clen;
|
||||
data = (const unsigned char *)data + clen;
|
||||
blen -= clen;
|
||||
if ( bptr == ( sizeof ctx->buf ) )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
COMPRESS32_4WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_4WAY( ctx->rounds );
|
||||
bptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32_4WAY(sc);
|
||||
sc->ptr = ptr;
|
||||
WRITE_STATE32_4WAY( ctx );
|
||||
ctx->ptr = bptr>>2;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
// union {
|
||||
__m128i buf[16];
|
||||
// sph_u32 dummy;
|
||||
// } u;
|
||||
size_t ptr, k;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
__m128i *out;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>2] = _mm_set1_epi32( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
size_t ptr = ctx->ptr;
|
||||
size_t vptr = ctx->ptr>>2;
|
||||
unsigned bit_len = ( (unsigned)ptr << 3 );
|
||||
uint32_t tl = ctx->T0 + bit_len;
|
||||
uint32_t th = ctx->T1;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
else if ( ctx->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
ctx->T0 = 0xFFFFFE00UL + bit_len;
|
||||
ctx->T1 = ctx->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
ctx->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 52 )
|
||||
buf[vptr] = _mm_set1_epi32( 0x80 );
|
||||
|
||||
if ( vptr < 12 )
|
||||
{
|
||||
memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
buf[52>>2] = _mm_or_si128( buf[52>>2],
|
||||
_mm_set1_epi32( 0x01000000UL ) );
|
||||
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
memset_zero_128( buf + vptr + 1, 13 - vptr );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
|
||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
memset_zero_128( buf, 56>>2 );
|
||||
if (out_size_w32 == 8)
|
||||
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
|
||||
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, buf, 64 );
|
||||
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_128( buf, 56>>2 );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
|
||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( ctx, buf, 64 );
|
||||
}
|
||||
out = (__m128i*)dst;
|
||||
for ( k = 0; k < out_size_w32; k++ )
|
||||
out[k] = mm128_bswap_32( sc->H[k] );
|
||||
|
||||
casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
|
||||
casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
|
||||
casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
|
||||
casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
|
||||
casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
|
||||
casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
|
||||
casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
|
||||
casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
@@ -1217,163 +928,32 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
out[k] = mm256_bswap_32( sc->H[k] );
|
||||
}
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm256_set1_epi64x( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm256_set1_epi64x( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_4WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64_4WAY(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if (ptr == buf_size )
|
||||
{
|
||||
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64_4WAY(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way_close( blake_4way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
{
|
||||
// union {
|
||||
__m256i buf[16];
|
||||
// sph_u64 dummy;
|
||||
// } u;
|
||||
size_t ptr, k;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
__m256i *out;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
|
||||
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(buf+(120>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(buf+(120>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
out = (__m256i*)dst;
|
||||
for ( k = 0; k < out_size_w64; k++ )
|
||||
out[k] = mm256_bswap_64( sc->H[k] );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
// default 14 rounds, backward copatibility
|
||||
void
|
||||
blake256_4way_init(void *cc)
|
||||
blake256_4way_init(void *ctx)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
|
||||
blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way(void *cc, const void *data, size_t len)
|
||||
blake256_4way(void *ctx, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
blake32_4way(ctx, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_close(void *cc, void *dst)
|
||||
blake256_4way_close(void *ctx, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
blake32_4way_close(ctx, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Blake-256 8way
|
||||
// Blake-256 8 way
|
||||
|
||||
void
|
||||
blake256_8way_init(void *cc)
|
||||
@@ -1473,38 +1053,8 @@ blake256r8_8way_close(void *cc, void *dst)
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
void
|
||||
blake512_4way_init(void *cc)
|
||||
{
|
||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
//#endif
|
322
algo/blake/blake256-hash-4way.c.new
Normal file
322
algo/blake/blake256-hash-4way.c.new
Normal file
@@ -0,0 +1,322 @@
|
||||
// convert blake256 32 bit to use 64 bit with serial vectoring
|
||||
//
|
||||
// cut calls to GS in half
|
||||
//
|
||||
// combine V
|
||||
// v0 = {V0,V1}
|
||||
// v1 = {V2,V3}
|
||||
// v2 = {V4,V5}
|
||||
// v3 = {V6,V7}
|
||||
// v4 = {V8,V9}
|
||||
// v5 = {VA,VB}
|
||||
// v6 = {VC,VD}
|
||||
// v7 = {CE,VF}
|
||||
//
|
||||
// v6x = {VD,VC} swap(VC,VD) swap(v6)
|
||||
// v7x = {VF,VE} swap(VE,VF) swap(v7)
|
||||
//
|
||||
// V0 = v1v0
|
||||
// V1 = v3v2
|
||||
// V2 = v5v4
|
||||
// V3 = v7v6
|
||||
// V4 = v9v8
|
||||
// V5 = vbva
|
||||
// V6 = vdvc
|
||||
// V7 = vfve
|
||||
//
|
||||
// The rotate in ROUND is to effect straddle and unstraddle for the third
|
||||
// and 4th iteration of GS.
|
||||
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
|
||||
// 256 bits. After the transform they must be restored with only the
|
||||
// chosen bits modified in the original 2 vectors.
|
||||
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
|
||||
// 256 bit vector and saves the untouched bits temporailly in arg0, the
|
||||
// "high" 256 bit vector. Simply reverse the process to restore data back
|
||||
// to original positions.
|
||||
|
||||
// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
|
||||
//
|
||||
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
|
||||
// on top of parallel vectoring. Same data in the same place just taking
|
||||
// two chunks at a time.
|
||||
//
|
||||
// Transparent to user, x2 mode used when AVX2 detected.
|
||||
// Use existing 4way context but revert to scalar types.
|
||||
// Same interleave function (128 bit) or x2 with 256 bit?
|
||||
// User trsnaparency would have to apply to interleave as well.
|
||||
//
|
||||
// Use common 4way update and close
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blakex2_4way_small_context __attribute__ ((aligned (64)));
|
||||
*/
|
||||
|
||||
static void
|
||||
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
|
||||
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
|
||||
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
|
||||
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
|
||||
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
|
||||
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
|
||||
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
|
||||
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 1 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 2 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 3 ) = m128_zero;
|
||||
/*
|
||||
sc->S[0] = _mm_set1_epi32( salt[0] );
|
||||
sc->S[1] = _mm_set1_epi32( salt[1] );
|
||||
sc->S[2] = _mm_set1_epi32( salt[2] );
|
||||
sc->S[3] = _mm_set1_epi32( salt[3] );
|
||||
*/
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
|
||||
{
|
||||
__m128i *buf = (__m256i*)ctx->buf;
|
||||
size_t bptr = ctx->ptr << 2;
|
||||
size_t vptr = ctx->ptr >> 3;
|
||||
size_t blen = len << 2;
|
||||
// unsigned char *buf = ctx->buf;
|
||||
// size_t ptr = ctx->ptr<<4; // repurposed
|
||||
DECL_STATE32x2
|
||||
|
||||
// buf = sc->buf;
|
||||
// ptr = sc->ptr;
|
||||
|
||||
// adjust len for use with ptr, clen, all absolute bytes.
|
||||
// int blen = len<<2;
|
||||
|
||||
if ( blen < (sizeof ctx->buf) - bptr )
|
||||
{
|
||||
memcpy( buf + vptr, data, blen );
|
||||
ptr += blen;
|
||||
ctx->ptr = bptr >> 2;;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32( ctx );
|
||||
while ( blen > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = ( sizeof sc->buf ) - ptr;
|
||||
if ( clen > blen )
|
||||
clen = blen;
|
||||
memcpy( buf + vptr, data, clen );
|
||||
bptr += clen;
|
||||
vptr = bptr >> 5;
|
||||
data = (const unsigned char *)data + clen;
|
||||
blen -= clen;
|
||||
if ( bptr == sizeof ctx->buf )
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
|
||||
T1 += 1;
|
||||
COMPRESS32x2_4WAY( ctx->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32x2( ctx );
|
||||
ctx->ptr = bptr >> 2;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
|
||||
{
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
size_t ptr = ctx->ptr;
|
||||
size_t vptr = ctx->ptr>>2;
|
||||
unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane
|
||||
uint32_t th = ctx->T1;
|
||||
uint32_t tl = ctx->T0 + bit_len;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( ctx->T0 == 0 )
|
||||
{
|
||||
ctx->T0 = 0xFFFFFE00UL + bit_len;
|
||||
ctx->T1 -= 1;
|
||||
}
|
||||
else
|
||||
ctx->T0 -= 512 - bit_len;
|
||||
|
||||
// memset doesn't do ints
|
||||
buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
|
||||
|
||||
if ( vptr < 5 )
|
||||
{
|
||||
memset_zero_256( buf + vptr + 1, 6 - vptr );
|
||||
buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
|
||||
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) );
|
||||
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
|
||||
th,th,th,th ) );
|
||||
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( vbuf + vptr + 1, 7 - vptr );
|
||||
blake32x2_4way( ctx, vbuf + ptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
buf[ 6 ] = mm256_zero;
|
||||
buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
|
||||
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
|
||||
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
|
||||
th, th, th, th );
|
||||
blake32x2_4way( ctx, buf, 64 );
|
||||
}
|
||||
|
||||
casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
|
||||
casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
|
||||
casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
|
||||
casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#define DECL_STATE32x2_4WAY \
|
||||
__m256i H0, H1, H2, H3; \
|
||||
__m256i S0, S1; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32x2_4WAY(state) do \
|
||||
{ \
|
||||
H0 = casti_m256i( state->H, 0 ); \
|
||||
H1 = casti_m256i( state->H, 1 ); \
|
||||
H2 = casti_m256i( state->H, 2 ); \
|
||||
H3 = casti_m256i( state->H, 3 ); \
|
||||
S0 = casti_m256i( state->S, 0 ); \
|
||||
S1 = casti_m256i( state->S, 1 ); \
|
||||
T0 = state->T0; \
|
||||
T1 = state->T1; \
|
||||
|
||||
#define WRITE_STATE32x2_4WAY(state) do { \
|
||||
casti_m256i( state->H, 0 ) = H0; \
|
||||
casti_m256i( state->H, 1 ) = H1; \
|
||||
casti_m256i( state->H, 2 ) = H2; \
|
||||
casti_m256i( state->H, 3 ) = H3; \
|
||||
casti_m256i( state->S, 0 ) = S0; \
|
||||
casti_m256i( state->S, 1 ) = S1; \
|
||||
state->T0 = T0; \
|
||||
state->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
|
||||
_mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
|
||||
d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
|
||||
_mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_Sx2_4WAY(r) do \
|
||||
{ \
|
||||
GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
|
||||
CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
|
||||
GS2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
|
||||
CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
|
||||
mm256_ror1x128_512( V3, V2 ); \
|
||||
mm256_ror1x128_512( V6, V7 ); \
|
||||
GS2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
|
||||
CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
|
||||
GS2_4WAY( Mx(r, C), Mx(r, D), Mx(r, C), Mx(r, D), \
|
||||
CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
|
||||
mm256_rol1x128_512( V2, V3 ); \
|
||||
mm256_rol1x128_512( V7, V6 );
|
||||
|
||||
#define COMPRESS32x2_4WAY( rounds ) do \
|
||||
{ \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
|
||||
CS0, CS0, CS0, CS0 ) ); \
|
||||
V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
|
||||
CS2, CS2, CS2, CS2 ) ); \
|
||||
V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
|
||||
_mm256_set_epi32( CS5, CS5, CS5, CS5, \
|
||||
CS4, CS4, CS4, CS4 ) ); \
|
||||
V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
||||
_mm256_set_epi32( CS7, CS7, CS7, CS7, \
|
||||
CS6, CS6, CS6, CS6 ) ); \
|
||||
M0 = mm256_bswap_32( buf[ 0] ); \
|
||||
M1 = mm256_bswap_32( buf[ 1] ); \
|
||||
M2 = mm256_bswap_32( buf[ 2] ); \
|
||||
M3 = mm256_bswap_32( buf[ 3] ); \
|
||||
M4 = mm256_bswap_32( buf[ 4] ); \
|
||||
M5 = mm256_bswap_32( buf[ 5] ); \
|
||||
M6 = mm256_bswap_32( buf[ 6] ); \
|
||||
M7 = mm256_bswap_32( buf[ 7] ); \
|
||||
ROUND_Sx2_4WAY(0); \
|
||||
ROUND_Sx2_4WAY(1); \
|
||||
ROUND_Sx2_4WAY(2); \
|
||||
ROUND_Sx2_4WAY(3); \
|
||||
ROUND_Sx2_4WAY(4); \
|
||||
ROUND_Sx2_4WAY(5); \
|
||||
ROUND_Sx2_4WAY(6); \
|
||||
ROUND_Sx2_4WAY(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_Sx2_4WAY(8); \
|
||||
ROUND_Sx2_4WAY(9); \
|
||||
ROUND_Sx2_4WAY(0); \
|
||||
ROUND_Sx2_4WAY(1); \
|
||||
ROUND_Sx2_4WAY(2); \
|
||||
ROUND_Sx2_4WAY(3); \
|
||||
} \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@@ -16,7 +16,7 @@
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
701
algo/blake/blake512-hash-4way.c
Normal file
701
algo/blake/blake512-hash-4way.c
Normal file
@@ -0,0 +1,701 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_SMALL_FOOTPRINT_BLAKE 1
|
||||
#endif
|
||||
|
||||
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
|
||||
#define SPH_COMPACT_BLAKE_64 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512
|
||||
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
|
||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
#define Z02 2
|
||||
#define Z03 3
|
||||
#define Z04 4
|
||||
#define Z05 5
|
||||
#define Z06 6
|
||||
#define Z07 7
|
||||
#define Z08 8
|
||||
#define Z09 9
|
||||
#define Z0A A
|
||||
#define Z0B B
|
||||
#define Z0C C
|
||||
#define Z0D D
|
||||
#define Z0E E
|
||||
#define Z0F F
|
||||
|
||||
#define Z10 E
|
||||
#define Z11 A
|
||||
#define Z12 4
|
||||
#define Z13 8
|
||||
#define Z14 9
|
||||
#define Z15 F
|
||||
#define Z16 D
|
||||
#define Z17 6
|
||||
#define Z18 1
|
||||
#define Z19 C
|
||||
#define Z1A 0
|
||||
#define Z1B 2
|
||||
#define Z1C B
|
||||
#define Z1D 7
|
||||
#define Z1E 5
|
||||
#define Z1F 3
|
||||
|
||||
#define Z20 B
|
||||
#define Z21 8
|
||||
#define Z22 C
|
||||
#define Z23 0
|
||||
#define Z24 5
|
||||
#define Z25 2
|
||||
#define Z26 F
|
||||
#define Z27 D
|
||||
#define Z28 A
|
||||
#define Z29 E
|
||||
#define Z2A 3
|
||||
#define Z2B 6
|
||||
#define Z2C 7
|
||||
#define Z2D 1
|
||||
#define Z2E 9
|
||||
#define Z2F 4
|
||||
|
||||
#define Z30 7
|
||||
#define Z31 9
|
||||
#define Z32 3
|
||||
#define Z33 1
|
||||
#define Z34 D
|
||||
#define Z35 C
|
||||
#define Z36 B
|
||||
#define Z37 E
|
||||
#define Z38 2
|
||||
#define Z39 6
|
||||
#define Z3A 5
|
||||
#define Z3B A
|
||||
#define Z3C 4
|
||||
#define Z3D 0
|
||||
#define Z3E F
|
||||
#define Z3F 8
|
||||
|
||||
#define Z40 9
|
||||
#define Z41 0
|
||||
#define Z42 5
|
||||
#define Z43 7
|
||||
#define Z44 2
|
||||
#define Z45 4
|
||||
#define Z46 A
|
||||
#define Z47 F
|
||||
#define Z48 E
|
||||
#define Z49 1
|
||||
#define Z4A B
|
||||
#define Z4B C
|
||||
#define Z4C 6
|
||||
#define Z4D 8
|
||||
#define Z4E 3
|
||||
#define Z4F D
|
||||
|
||||
#define Z50 2
|
||||
#define Z51 C
|
||||
#define Z52 6
|
||||
#define Z53 A
|
||||
#define Z54 0
|
||||
#define Z55 B
|
||||
#define Z56 8
|
||||
#define Z57 3
|
||||
#define Z58 4
|
||||
#define Z59 D
|
||||
#define Z5A 7
|
||||
#define Z5B 5
|
||||
#define Z5C F
|
||||
#define Z5D E
|
||||
#define Z5E 1
|
||||
#define Z5F 9
|
||||
|
||||
#define Z60 C
|
||||
#define Z61 5
|
||||
#define Z62 1
|
||||
#define Z63 F
|
||||
#define Z64 E
|
||||
#define Z65 D
|
||||
#define Z66 4
|
||||
#define Z67 A
|
||||
#define Z68 0
|
||||
#define Z69 7
|
||||
#define Z6A 6
|
||||
#define Z6B 3
|
||||
#define Z6C 9
|
||||
#define Z6D 2
|
||||
#define Z6E 8
|
||||
#define Z6F B
|
||||
|
||||
#define Z70 D
|
||||
#define Z71 B
|
||||
#define Z72 7
|
||||
#define Z73 E
|
||||
#define Z74 C
|
||||
#define Z75 1
|
||||
#define Z76 3
|
||||
#define Z77 9
|
||||
#define Z78 5
|
||||
#define Z79 0
|
||||
#define Z7A F
|
||||
#define Z7B 4
|
||||
#define Z7C 8
|
||||
#define Z7D 6
|
||||
#define Z7E 2
|
||||
#define Z7F A
|
||||
|
||||
#define Z80 6
|
||||
#define Z81 F
|
||||
#define Z82 E
|
||||
#define Z83 9
|
||||
#define Z84 B
|
||||
#define Z85 3
|
||||
#define Z86 0
|
||||
#define Z87 8
|
||||
#define Z88 C
|
||||
#define Z89 2
|
||||
#define Z8A D
|
||||
#define Z8B 7
|
||||
#define Z8C 1
|
||||
#define Z8D 4
|
||||
#define Z8E A
|
||||
#define Z8F 5
|
||||
|
||||
#define Z90 A
|
||||
#define Z91 2
|
||||
#define Z92 8
|
||||
#define Z93 4
|
||||
#define Z94 7
|
||||
#define Z95 6
|
||||
#define Z96 1
|
||||
#define Z97 5
|
||||
#define Z98 F
|
||||
#define Z99 B
|
||||
#define Z9A 9
|
||||
#define Z9B E
|
||||
#define Z9C 3
|
||||
#define Z9D C
|
||||
#define Z9E D
|
||||
#define Z9F 0
|
||||
|
||||
#define Mx(r, i) Mx_(Z ## r ## i)
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
//current_impl
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define READ_STATE64_4WAY(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64_4WAY(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
|
||||
// not used
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M[16]; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_bswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_bswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_bswap_64( *(buf+2) ); \
|
||||
M[0x3] = mm256_bswap_64( *(buf+3) ); \
|
||||
M[0x4] = mm256_bswap_64( *(buf+4) ); \
|
||||
M[0x5] = mm256_bswap_64( *(buf+5) ); \
|
||||
M[0x6] = mm256_bswap_64( *(buf+6) ); \
|
||||
M[0x7] = mm256_bswap_64( *(buf+7) ); \
|
||||
M[0x8] = mm256_bswap_64( *(buf+8) ); \
|
||||
M[0x9] = mm256_bswap_64( *(buf+9) ); \
|
||||
M[0xA] = mm256_bswap_64( *(buf+10) ); \
|
||||
M[0xB] = mm256_bswap_64( *(buf+11) ); \
|
||||
M[0xC] = mm256_bswap_64( *(buf+12) ); \
|
||||
M[0xD] = mm256_bswap_64( *(buf+13) ); \
|
||||
M[0xE] = mm256_bswap_64( *(buf+14) ); \
|
||||
M[0xF] = mm256_bswap_64( *(buf+15) ); \
|
||||
for (r = 0; r < 16; r ++) \
|
||||
ROUND_B_4WAY(r); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
//current impl
|
||||
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M0 = mm256_bswap_64( *(buf + 0) ); \
|
||||
M1 = mm256_bswap_64( *(buf + 1) ); \
|
||||
M2 = mm256_bswap_64( *(buf + 2) ); \
|
||||
M3 = mm256_bswap_64( *(buf + 3) ); \
|
||||
M4 = mm256_bswap_64( *(buf + 4) ); \
|
||||
M5 = mm256_bswap_64( *(buf + 5) ); \
|
||||
M6 = mm256_bswap_64( *(buf + 6) ); \
|
||||
M7 = mm256_bswap_64( *(buf + 7) ); \
|
||||
M8 = mm256_bswap_64( *(buf + 8) ); \
|
||||
M9 = mm256_bswap_64( *(buf + 9) ); \
|
||||
MA = mm256_bswap_64( *(buf + 10) ); \
|
||||
MB = mm256_bswap_64( *(buf + 11) ); \
|
||||
MC = mm256_bswap_64( *(buf + 12) ); \
|
||||
MD = mm256_bswap_64( *(buf + 13) ); \
|
||||
ME = mm256_bswap_64( *(buf + 14) ); \
|
||||
MF = mm256_bswap_64( *(buf + 15) ); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
ROUND_B_4WAY(6); \
|
||||
ROUND_B_4WAY(7); \
|
||||
ROUND_B_4WAY(8); \
|
||||
ROUND_B_4WAY(9); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm256_set1_epi64x( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm256_set1_epi64x( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_4WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64_4WAY(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if (ptr == buf_size )
|
||||
{
|
||||
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64_4WAY(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way_close( blake_4way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
{
|
||||
// union {
|
||||
__m256i buf[16];
|
||||
// sph_u64 dummy;
|
||||
// } u;
|
||||
size_t ptr, k;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
__m256i *out;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
|
||||
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(buf+(120>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(buf+(120>>3)) = mm256_bswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
out = (__m256i*)dst;
|
||||
for ( k = 0; k < out_size_w64; k++ )
|
||||
out[k] = mm256_bswap_64( sc->H[k] );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_init(void *cc)
|
||||
{
|
||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -41,15 +41,18 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_bmw256 256
|
||||
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// BMW-256 4 way 32
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[64];
|
||||
__m128i H[16];
|
||||
@@ -59,6 +62,60 @@ typedef struct {
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-256 8 way 32
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[64];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_8way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef bmw_8way_small_context bmw256_8way_context;
|
||||
|
||||
void bmw256_8way_init( bmw256_8way_context *ctx );
|
||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
|
||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// BMW-512 2 way 64
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16];
|
||||
__m128i H[16];
|
||||
size_t ptr;
|
||||
uint64_t bit_count;
|
||||
} bmw_2way_big_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef bmw_2way_big_context bmw512_2way_context;
|
||||
|
||||
void bmw512_2way_init( bmw512_2way_context *ctx );
|
||||
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
|
||||
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-512 4 way 64
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
@@ -68,14 +125,6 @@ typedef struct {
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
@@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst);
|
||||
void bmw512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
#endif // __AVX2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BMW_HASH_H__
|
||||
|
File diff suppressed because it is too large
Load Diff
1109
algo/bmw/bmw512-hash-4way.c
Normal file
1109
algo/bmw/bmw512-hash-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,24 @@
|
||||
|
||||
// 2x128
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -45,10 +63,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap128_64( x4 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x4 = mm256_swap64_128( x4 );
|
||||
x5 = mm256_swap64_128( x5 );
|
||||
x6 = mm256_swap64_128( x6 );
|
||||
x7 = mm256_swap64_128( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
@@ -69,10 +87,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_32( x4 );
|
||||
x5 = mm256_swap64_32( x5 );
|
||||
x6 = mm256_swap64_32( x6 );
|
||||
x7 = mm256_swap64_32( x7 );
|
||||
x4 = mm256_swap32_64( x4 );
|
||||
x5 = mm256_swap32_64( x5 );
|
||||
x6 = mm256_swap32_64( x6 );
|
||||
x7 = mm256_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||
@@ -86,36 +104,26 @@ static void transform_2way( cube_2way_context *sp )
|
||||
|
||||
}
|
||||
|
||||
cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
|
||||
|
||||
int cube_2way_reinit( cube_2way_context *sp )
|
||||
{
|
||||
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
int blockbytes )
|
||||
{
|
||||
int i;
|
||||
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
// all sizes of __m128i
|
||||
cube_2way_ctx_cache.hashlen = hashbitlen/128;
|
||||
cube_2way_ctx_cache.blocksize = blockbytes/16;
|
||||
cube_2way_ctx_cache.rounds = rounds;
|
||||
cube_2way_ctx_cache.pos = 0;
|
||||
__m256i* h = (__m256i*)sp->h;
|
||||
|
||||
for ( i = 0; i < 8; ++i )
|
||||
cube_2way_ctx_cache.h[i] = m256_zero;
|
||||
h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
|
||||
h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
|
||||
h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
|
||||
h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
|
||||
h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
|
||||
h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
|
||||
h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
|
||||
h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
|
||||
|
||||
cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
|
||||
0, rounds, blockbytes, hashbitlen / 8,
|
||||
0, rounds, blockbytes, hashbitlen / 8 );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_2way( &cube_2way_ctx_cache );
|
||||
|
||||
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
|
@@ -13,7 +13,26 @@
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// The result of hashing 10 rounds of initial data which is params and
|
||||
// mostly zeros.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
static void transform( cubehashParam *sp )
|
||||
{
|
||||
@@ -128,48 +147,37 @@ static void transform( cubehashParam *sp )
|
||||
#endif
|
||||
} // transform
|
||||
|
||||
// Cubehash context initializing is very expensive.
|
||||
// Cache the intial value for faster reinitializing.
|
||||
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
|
||||
|
||||
int cubehashReinit( cubehashParam *sp )
|
||||
{
|
||||
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
|
||||
return SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
// Initialize the cache then copy to sp.
|
||||
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
|
||||
{
|
||||
int i;
|
||||
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
if ( hashbitlen < 8 ) return BAD_HASHBITLEN;
|
||||
if ( hashbitlen > 512 ) return BAD_HASHBITLEN;
|
||||
if ( hashbitlen != 8 * (hashbitlen / 8) ) return BAD_HASHBITLEN;
|
||||
__m256i* x = (__m256i*)sp->x;
|
||||
|
||||
/* Sanity checks */
|
||||
if ( rounds <= 0 || rounds > 32 )
|
||||
rounds = CUBEHASH_ROUNDS;
|
||||
if ( blockbytes <= 0 || blockbytes >= 256)
|
||||
blockbytes = CUBEHASH_BLOCKBYTES;
|
||||
x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
|
||||
x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
|
||||
x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
|
||||
x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
|
||||
|
||||
// all sizes of __m128i
|
||||
cube_ctx_cache.hashlen = hashbitlen/128;
|
||||
cube_ctx_cache.blocksize = blockbytes/16;
|
||||
cube_ctx_cache.rounds = rounds;
|
||||
cube_ctx_cache.pos = 0;
|
||||
#else
|
||||
|
||||
for ( i = 0; i < 8; ++i )
|
||||
cube_ctx_cache.x[i] = _mm_setzero_si128();;
|
||||
__m128i* x = (__m128i*)sp->x;
|
||||
|
||||
cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
|
||||
hashbitlen / 8 );
|
||||
x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
|
||||
x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
|
||||
x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
|
||||
x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
|
||||
x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
|
||||
x[5] = _mm_set_epi64x( iv[11], iv[10] );
|
||||
x[6] = _mm_set_epi64x( iv[13], iv[12] );
|
||||
x[7] = _mm_set_epi64x( iv[15], iv[14] );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform( &cube_ctx_cache );
|
||||
|
||||
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
|
||||
#endif
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
@@ -11,6 +11,8 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#define SPH_FUGUE_NOCOPY 1
|
||||
|
||||
static const sph_u32 IV224[] = {
|
||||
SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
|
||||
SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
|
||||
|
@@ -12,7 +12,7 @@
|
||||
#include <memory.h>
|
||||
#include "hash-groestl.h"
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
|
@@ -9,7 +9,7 @@
|
||||
#include <memory.h>
|
||||
#include "hash-groestl256.h"
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
|
@@ -40,7 +40,7 @@
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
@@ -69,7 +69,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_haval256_5 256
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include "wolf-aes.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
|
||||
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
|
||||
{
|
||||
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
|
||||
}
|
||||
}
|
||||
|
||||
#else // NO AVX
|
||||
#else // NO SSE4.2
|
||||
|
||||
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
|
||||
{
|
||||
|
@@ -101,39 +101,6 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
|
||||
g_work->data[31] = 0x00000280;
|
||||
}
|
||||
|
||||
// hodl build_extra_header is redundant, hodl can use std_build_extra_header
|
||||
// and call hodl_build_block_header.
|
||||
#if 0
|
||||
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
|
||||
{
|
||||
uchar merkle_tree[64] = { 0 };
|
||||
size_t t;
|
||||
// int i;
|
||||
|
||||
algo_gate.gen_merkle_root( merkle_tree, sctx );
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
|
||||
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
|
||||
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
|
||||
le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
|
||||
/*
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
|
||||
|
||||
g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
|
||||
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
|
||||
g_work->data[22] = 0x80000000;
|
||||
g_work->data[31] = 0x00000280;
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
||||
// called only by thread 0, saves a backup of g_work
|
||||
void hodl_get_new_work( struct work* work, struct work* g_work)
|
||||
{
|
||||
@@ -179,7 +146,7 @@ bool hodl_do_this_thread( int thr_id )
|
||||
int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
|
||||
pthread_barrier_wait( &hodl_barrier );
|
||||
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
|
||||
@@ -189,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
|
||||
bool register_hodl_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
#if !defined(__AES__)
|
||||
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
|
||||
return false;
|
||||
#endif
|
||||
@@ -207,7 +174,6 @@ bool register_hodl_algo( algo_gate_t* gate )
|
||||
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
|
||||
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
|
||||
gate->build_block_header = (void*)&hodl_build_block_header;
|
||||
// gate->build_extraheader = (void*)&hodl_build_extraheader;
|
||||
gate->resync_threads = (void*)&hodl_resync_threads;
|
||||
gate->do_this_thread = (void*)&hodl_do_this_thread;
|
||||
gate->work_cmp_size = 76;
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include "hodl-wolf.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
|
||||
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
|
||||
void *MidHash )
|
||||
@@ -139,7 +139,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
return(0);
|
||||
|
||||
|
||||
#else // no AVX
|
||||
#else // no SSE4.2
|
||||
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -160,7 +160,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
{
|
||||
// copy data to first l2 cache
|
||||
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
|
||||
#ifndef NO_AES_NI
|
||||
for(int j = 0; j < AES_ITERATIONS; j++)
|
||||
{
|
||||
CacheEntry TmpXOR;
|
||||
@@ -184,7 +183,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
|
||||
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
|
||||
- 1 ], 256 ); }
|
||||
#endif
|
||||
// use last X bits as solution
|
||||
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
|
||||
& (COMPARE_SIZE - 1) ) < 1000 )
|
||||
@@ -206,7 +204,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
*hashes_done = CollisionCount;
|
||||
return(0);
|
||||
|
||||
#endif
|
||||
#endif // SSE4.2 else
|
||||
|
||||
}
|
||||
|
||||
@@ -218,5 +216,5 @@ void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
|
||||
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // AES
|
||||
|
||||
|
@@ -22,16 +22,20 @@ typedef struct
|
||||
#ifdef __AVX2__
|
||||
__m256i h[8];
|
||||
__m256i w[80];
|
||||
#else // AVX
|
||||
#elif defined(__SSE4_2__)
|
||||
__m128i h[8];
|
||||
__m128i w[80];
|
||||
#else
|
||||
int dummy;
|
||||
#endif
|
||||
} Sha512Context;
|
||||
|
||||
#ifdef __AVX2__
|
||||
#define SHA512_PARALLEL_N 8
|
||||
#else // AVX
|
||||
#elif defined(__SSE$_2__)
|
||||
#define SHA512_PARALLEL_N 4
|
||||
#else
|
||||
#define SHA512_PARALLEL_N 1 // dummy value
|
||||
#endif
|
||||
|
||||
//SHA-512 related functions
|
||||
|
@@ -1,4 +1,5 @@
|
||||
#ifndef __AVX2__
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
//#ifdef __AVX__
|
||||
|
||||
@@ -10,6 +11,10 @@
|
||||
#include <sys/endian.h>
|
||||
#endif
|
||||
|
||||
#if defined(__CYGWIN__)
|
||||
#include <endian.h>
|
||||
#endif
|
||||
|
||||
#include "tmmintrin.h"
|
||||
#include "smmintrin.h"
|
||||
|
||||
|
@@ -8,6 +8,10 @@
|
||||
#include <sys/endian.h>
|
||||
#endif
|
||||
|
||||
#if defined(__CYGWIN__)
|
||||
#include <endian.h>
|
||||
#endif
|
||||
|
||||
#include "tmmintrin.h"
|
||||
#include "smmintrin.h"
|
||||
#include "immintrin.h"
|
||||
|
@@ -44,7 +44,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_jh256 256
|
||||
|
||||
|
@@ -44,7 +44,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_keccak256 256
|
||||
|
||||
|
@@ -91,7 +91,7 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
static const sph_u64 RC[] = {
|
||||
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
|
||||
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
|
||||
@@ -106,7 +106,7 @@ static const sph_u64 RC[] = {
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
|
||||
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
|
||||
};
|
||||
|
||||
*/
|
||||
#define kekDECL_STATE \
|
||||
sph_u64 keca00, keca01, keca02, keca03, keca04; \
|
||||
sph_u64 keca10, keca11, keca12, keca13, keca14; \
|
||||
@@ -756,6 +756,20 @@ static const sph_u64 RC[] = {
|
||||
* tested faster saving space
|
||||
*/
|
||||
#define KECCAK_F_1600_ do { \
|
||||
static const sph_u64 RC[] = { \
|
||||
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
|
||||
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
|
||||
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
|
||||
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
|
||||
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
|
||||
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
|
||||
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
|
||||
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
|
||||
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
|
||||
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
|
||||
}; \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 4) { \
|
||||
KF_ELT( 0, 1, RC[j + 0]); \
|
||||
@@ -791,7 +805,7 @@ static const sph_u64 RC[] = {
|
||||
/* load initial constants */
|
||||
#define KEC_I
|
||||
|
||||
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
|
||||
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
|
||||
/*
|
||||
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
|
||||
*/
|
||||
@@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0
|
||||
/* load hash for loop */
|
||||
#define KEC_U \
|
||||
do { \
|
||||
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
|
||||
/*memcpy(hashbuf, hash, 64); */ \
|
||||
memcpy(hash + 64, keczword, 8); \
|
||||
} while (0);
|
||||
|
@@ -24,7 +24,7 @@
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
|
||||
0UL, 0UL, 0UL, 0xffffffffUL )
|
||||
|
@@ -24,7 +24,7 @@
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
|
@@ -20,7 +20,7 @@
|
||||
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#define MULT2(a0,a1) do \
|
||||
@@ -30,6 +30,19 @@
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
static inline __m256i mult2_avx2( a )
|
||||
{
|
||||
__m128 a0, a0, b;
|
||||
a0 = mm128_extractlo_256( a );
|
||||
a1 = mm128_extracthi_256( a );
|
||||
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
|
||||
return mm256_concat_128( a1, a0 );
|
||||
}
|
||||
*/
|
||||
|
||||
#define STEP_PART(x,c,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
|
@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx.blake, input + (64<<2), 16 );
|
||||
blake256_4way_close( &ctx.blake, vhash32 );
|
||||
|
||||
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
|
||||
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
|
||||
keccak256_4way( &ctx.keccak, vhash64, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
@@ -55,11 +56,11 @@ void allium_4way_hash( void *state, const void *input )
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
@@ -67,73 +68,64 @@ void allium_4way_hash( void *state, const void *input )
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
skein256_4way( &ctx.skein, vhash64, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
|
||||
}
|
||||
|
||||
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
blake256_4way_init( &allium_4way_ctx.blake );
|
||||
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
allium_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash+(lane<<3), mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input)
|
||||
}
|
||||
|
||||
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) endiandata[20];
|
||||
@@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x3ffff;
|
||||
|
@@ -1,6 +1,43 @@
|
||||
#include "lyra2-gate.h"
|
||||
|
||||
|
||||
// huge pages
|
||||
//
|
||||
// Use MAP_PRIVATE instead
|
||||
// In register algo:
|
||||
// replace thread safe whole matrix with a char**
|
||||
// alloc huge pages matrixsize * threads
|
||||
// make pointers to each thread to each thread, creating an
|
||||
// array[thread][matrix].
|
||||
// Each thread can create its own matrix pointer:
|
||||
// my_matrix = the matrix + ( thread_id * matrix_size )
|
||||
//
|
||||
// Compiler version check?
|
||||
// Fallback?
|
||||
//
|
||||
// create a generic utility to map & unmap huge pages.
|
||||
// ptr = malloc_huge( size );
|
||||
// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
|
||||
// it should be aligned to the page boundary. It may be desireable to
|
||||
// have the matrix size rounded up if necessary to something bigger
|
||||
// than 64 byte, say 4 kbytes a small page size.
|
||||
|
||||
// Define some constants for indivual parameters and matrix size for
|
||||
// each algo. Use the parameter constants where apropriate.
|
||||
// Convert algos that don't yet do so to use dynamic alllocation.
|
||||
// Alloc huge pages globally. If ok each thread will create a pointer to
|
||||
// its chunk. If fail each thread will use use _mm_alloc for itself.
|
||||
// BLOCK_LEN_BYTES is 768.
|
||||
|
||||
#define LYRA2REV3_NROWS 4
|
||||
#define LYRA2REV3_NCOLS 4
|
||||
/*
|
||||
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
|
||||
(LYRA2REV3_NROWS)*8)
|
||||
*/
|
||||
|
||||
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
|
||||
|
||||
__thread uint64_t* l2v3_wholeMatrix;
|
||||
|
||||
bool lyra2rev3_thread_init()
|
||||
@@ -27,7 +64,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev3;
|
||||
gate->hash = (void*)&lyra2rev3_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
|
||||
gate->set_target = (void*)&alt_set_target;
|
||||
return true;
|
||||
|
@@ -5,7 +5,9 @@
|
||||
#include <stdint.h>
|
||||
#include "lyra2.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#define LYRA2REV3_4WAY
|
||||
#endif
|
||||
|
||||
@@ -17,14 +19,14 @@ bool register_lyra2rev3_algo( algo_gate_t* gate );
|
||||
|
||||
void lyra2rev3_4way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_lyra2rev3_4way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
void lyra2rev3_hash( void *state, const void *input );
|
||||
int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_lyra2rev3_ctx();
|
||||
|
||||
#endif
|
||||
@@ -43,25 +45,25 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
|
||||
|
||||
void lyra2rev2_4way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_lyra2rev2_4way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
void lyra2rev2_hash( void *state, const void *input );
|
||||
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_lyra2rev2_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
/////////////////////////
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define LYRA2Z_4WAY
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
// #define LYRA2Z_8WAY
|
||||
#define LYRA2Z_8WAY
|
||||
#endif
|
||||
|
||||
|
||||
@@ -71,21 +73,21 @@ bool init_lyra2rev2_ctx();
|
||||
|
||||
void lyra2z_8way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_8way_thread_init();
|
||||
|
||||
#elif defined(LYRA2Z_4WAY)
|
||||
|
||||
void lyra2z_4way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_4way_thread_init();
|
||||
|
||||
#else
|
||||
|
||||
void lyra2z_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_thread_init();
|
||||
|
||||
#endif
|
||||
@@ -102,14 +104,14 @@ bool lyra2z_thread_init();
|
||||
|
||||
void lyra2h_4way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2h_4way_thread_init();
|
||||
|
||||
#else
|
||||
|
||||
void lyra2h_hash( void *state, const void *input );
|
||||
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2h_thread_init();
|
||||
|
||||
#endif
|
||||
@@ -126,14 +128,14 @@ bool register_allium_algo( algo_gate_t* gate );
|
||||
|
||||
void allium_4way_hash( void *state, const void *input );
|
||||
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_4way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
void allium_hash( void *state, const void *input );
|
||||
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_ctx();
|
||||
|
||||
#endif
|
||||
@@ -146,7 +148,7 @@ bool register_phi2_algo( algo_gate_t* gate );
|
||||
|
||||
void phi2_hash( void *state, const void *input );
|
||||
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_phi2_ctx();
|
||||
|
||||
#endif // LYRA2_GATE_H__
|
||||
|
@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
|
||||
/*
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
|
||||
|
||||
#if defined(__AVX2__)
|
||||
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
|
||||
#elif defined(__SSE4_2__)
|
||||
#elif defined(__SSE2__)
|
||||
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
|
||||
#else
|
||||
memset( wholeMatrix, 0, i );
|
||||
|
@@ -36,66 +36,53 @@ void lyra2h_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
|
||||
16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
|
||||
32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
|
||||
32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
|
||||
32, 16, 16, 16 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep= vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for ( int i=0; i < 20; i++ )
|
||||
be32enc( &edata[i], pdata[i] );
|
||||
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
lyra2h_4way_midstate( vdata );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
|
||||
be32enc( &edata[19], n );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
lyra2h_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input )
|
||||
}
|
||||
|
||||
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
@@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
@@ -6,9 +6,8 @@
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "lyra2.h"
|
||||
#include "algo-gate-api.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "simd-utils.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#endif
|
||||
|
||||
@@ -18,10 +17,10 @@ typedef struct {
|
||||
sph_blake256_context blake;
|
||||
sph_keccak256_context keccak;
|
||||
sph_skein256_context skein;
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl256_context groestl;
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} lyra2re_ctx_holder;
|
||||
|
||||
@@ -33,10 +32,10 @@ void init_lyra2re_ctx()
|
||||
sph_blake256_init(&lyra2re_ctx.blake);
|
||||
sph_keccak256_init(&lyra2re_ctx.keccak);
|
||||
sph_skein256_init(&lyra2re_ctx.skein);
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl256_init(&lyra2re_ctx.groestl);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
init_groestl256( &lyra2re_ctx.groestl, 32 );
|
||||
#else
|
||||
sph_groestl256_init(&lyra2re_ctx.groestl);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -72,18 +71,18 @@ void lyra2re_hash(void *state, const void *input)
|
||||
sph_skein256(&ctx.skein, hashA, 32);
|
||||
sph_skein256_close(&ctx.skein, hashB);
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
|
||||
#else
|
||||
sph_groestl256( &ctx.groestl, hashB, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hashA );
|
||||
#else
|
||||
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
|
||||
#endif
|
||||
|
||||
memcpy(state, hashA, 32);
|
||||
}
|
||||
|
||||
int scanhash_lyra2re(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -92,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
|
@@ -42,17 +42,19 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx.blake, input + (64<<2), 16 );
|
||||
blake256_4way_close( &ctx.blake, vhash );
|
||||
|
||||
mm256_reinterleave_4x64( vhash64, vhash, 256 );
|
||||
mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
|
||||
|
||||
keccak256_4way( &ctx.keccak, vhash64, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||
@@ -60,74 +62,71 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||
|
||||
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
skein256_4way( &ctx.skein, vhash64, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
cubehashReinit( &ctx.cube );
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
bmw256_4way( &ctx.bmw, vhash, 32 );
|
||||
bmw256_4way_close( &ctx.bmw, vhash );
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
bmw256_4way( &ctx.bmw, vhash, 32 );
|
||||
bmw256_4way_close( &ctx.bmw, state );
|
||||
}
|
||||
|
||||
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
|
||||
blake256_4way_init( &l2v2_4way_ctx.blake );
|
||||
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
do
|
||||
{
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
lyra2rev2_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input )
|
||||
}
|
||||
|
||||
int scanhash_lyra2rev2(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
@@ -35,7 +35,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
|
||||
blake256_4way( &ctx.blake, input, 80 );
|
||||
blake256_4way_close( &ctx.blake, vhash );
|
||||
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
|
||||
@@ -43,11 +43,11 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||
@@ -55,56 +55,50 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||
|
||||
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
bmw256_4way( &ctx.bmw, vhash, 32 );
|
||||
bmw256_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
bmw256_4way_close( &ctx.bmw, state );
|
||||
}
|
||||
|
||||
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
lyra2rev3_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -8,7 +8,6 @@
|
||||
|
||||
typedef struct {
|
||||
cubehashParam cube;
|
||||
// cubehashParam cube2;
|
||||
sph_blake256_context blake;
|
||||
sph_bmw256_context bmw;
|
||||
|
||||
@@ -20,7 +19,6 @@ static __thread sph_blake256_context l2v3_blake_mid;
|
||||
bool init_lyra2rev3_ctx()
|
||||
{
|
||||
cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 );
|
||||
// cubehashInit( &lyra2v3_ctx.cube2, 256, 16, 32 );
|
||||
sph_blake256_init( &lyra2v3_ctx.blake );
|
||||
sph_bmw256_init( &lyra2v3_ctx.bmw );
|
||||
return true;
|
||||
@@ -59,44 +57,51 @@ void lyra2rev3_hash( void *state, const void *input )
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2rev3(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_lyra2rev3( int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
// need big endian data
|
||||
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
l2v3_blake256_midstate( endiandata );
|
||||
l2v3_blake256_midstate( endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
lyra2rev3_hash(hash, endiandata);
|
||||
do
|
||||
{
|
||||
be32enc(&endiandata[19], nonce);
|
||||
lyra2rev3_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg )
|
||||
{
|
||||
if( fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
nonce++;
|
||||
if (hash[7] <= Htarg )
|
||||
{
|
||||
if( fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -36,66 +36,51 @@ void lyra2z_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for ( int i=0; i < 20; i++ )
|
||||
be32enc( &edata[i], pdata[i] );
|
||||
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
lyra2z_4way_midstate( vdata );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
lyra2z_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -134,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
|
||||
blake256_8way( &ctx_blake, input + (64*8), 16 );
|
||||
blake256_8way_close( &ctx_blake, vhash );
|
||||
|
||||
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
@@ -150,64 +135,49 @@ void lyra2z_8way_hash( void *state, const void *input )
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
memcpy( state+ 96, hash3, 32 );
|
||||
memcpy( state+128, hash1, 32 );
|
||||
memcpy( state+160, hash2, 32 );
|
||||
memcpy( state+192, hash3, 32 );
|
||||
memcpy( state+224, hash1, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 152; // 19*8
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &edata[i], pdata[i] );
|
||||
|
||||
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
|
||||
mm256_bswap_intrlv80_8x32( vdata, pdata );
|
||||
lyra2z_8way_midstate( vdata );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
be32enc( noncep+4, n+4 );
|
||||
be32enc( noncep+5, n+5 );
|
||||
be32enc( noncep+6, n+6 );
|
||||
be32enc( noncep+7, n+7 );
|
||||
|
||||
*noncev = mm256_bswap_32(
|
||||
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
|
||||
lyra2z_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
__thread uint64_t* lyra2z_matrix;
|
||||
|
||||
@@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input )
|
||||
}
|
||||
|
||||
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
@@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include <memory.h>
|
||||
#include "algo-gate-api.h"
|
||||
#include "lyra2.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
__thread uint64_t* lyra2z330_wholeMatrix;
|
||||
|
||||
@@ -16,39 +16,46 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
|
||||
}
|
||||
|
||||
int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x0000ff;
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
lyra2z330_hash( hash, endiandata, work->height );
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
do
|
||||
{
|
||||
be32enc(&endiandata[19], nonce);
|
||||
lyra2z330_hash( hash, endiandata, work->height );
|
||||
if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
work_set_target_ratio(work, hash);
|
||||
pdata[19] = nonce;
|
||||
if ( submit_work( mythr, work ) )
|
||||
applog( LOG_NOTICE, "Share %d submitted by thread %d",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
mythr->id );
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void lyra2z330_set_target( struct work* work, double job_diff )
|
||||
|
@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
|
||||
unsigned char _ALIGN(128) hashA[64];
|
||||
unsigned char _ALIGN(128) hashB[64];
|
||||
|
||||
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
|
||||
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
|
||||
phi2_has_roots ? 144 : 80 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
|
||||
phi2_has_roots ? 144 : 80 );
|
||||
|
||||
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
|
||||
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
|
||||
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
|
||||
sph_jh512_close( &ctx.jh, (void*)hash );
|
||||
|
||||
if ( hash[0] & 1 )
|
||||
{
|
||||
sph_gost512( &ctx.gost, (const void*)hash, 64 );
|
||||
{
|
||||
sph_gost512( &ctx.gost, (const void*)hash, 64 );
|
||||
sph_gost512_close( &ctx.gost, (void*)hash );
|
||||
}
|
||||
else
|
||||
{
|
||||
else
|
||||
{
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#else
|
||||
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
|
||||
sph_echo512_close( &ctx.echo1, (void*)hash );
|
||||
@@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input)
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) endiandata[36];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) endiandata[36];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
if(opt_benchmark){
|
||||
ptarget[7] = 0x00ff;
|
||||
}
|
||||
|
||||
if(opt_benchmark){
|
||||
ptarget[7] = 0x00ff;
|
||||
}
|
||||
phi2_has_roots = false;
|
||||
for ( int i=0; i < 36; i++ )
|
||||
{
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
if (i >= 20 && pdata[i]) phi2_has_roots = true;
|
||||
}
|
||||
|
||||
phi2_has_roots = false;
|
||||
for (int i=0; i < 36; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
if (i >= 20 && pdata[i]) phi2_has_roots = true;
|
||||
}
|
||||
do {
|
||||
be32enc( &endiandata[19], n );
|
||||
phi2_hash( hash, endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
phi2_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] < Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
if ( submit_work( mythr, work ) )
|
||||
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
thr_id );
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 1;
|
||||
}
|
||||
n++;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
|
||||
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
|
||||
blake2b_IV[5], blake2b_IV[4] );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
|
||||
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
|
||||
//Squeezes remaining bytes
|
||||
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
const int len_m128i = len / 16;
|
||||
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
|
||||
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
__m128i* in = (__m128i*)In;
|
||||
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
__m128i* in = (__m128i*)In;
|
||||
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
__m128i state0 = _mm_load_si128( state );
|
||||
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
__m128i state0 = _mm_load_si128( state );
|
||||
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined (__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* in = (__m128i*)rowIn;
|
||||
__m128i* inout = (__m128i*)rowInOut;
|
||||
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
|
||||
__m128i* state = (__m128i*)State;
|
||||
|
||||
// For the last round in this function not optimized for AVX
|
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
|
||||
// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
|
||||
out[4] = _mm_xor_si128( state[4], in[4] );
|
||||
out[5] = _mm_xor_si128( state[5], in[5] );
|
||||
|
||||
|
||||
__m128i t0, t1;
|
||||
t0 = _mm_srli_si128( state[0], 8 );
|
||||
t1 = _mm_srli_si128( state[1], 8 );
|
||||
inout[0] = _mm_xor_si128( inout[0],
|
||||
_mm_or_si128( _mm_slli_si128( state[0], 8 ),
|
||||
_mm_srli_si128( state[5], 8 ) ) );
|
||||
inout[1] = _mm_xor_si128( inout[1],
|
||||
_mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
|
||||
t0 = _mm_srli_si128( state[2], 8 );
|
||||
inout[2] = _mm_xor_si128( inout[2],
|
||||
_mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
|
||||
t1 = _mm_srli_si128( state[3], 8 );
|
||||
inout[3] = _mm_xor_si128( inout[3],
|
||||
_mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
|
||||
t0 = _mm_srli_si128( state[4], 8 );
|
||||
inout[4] = _mm_xor_si128( inout[4],
|
||||
_mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
|
||||
inout[5] = _mm_xor_si128( inout[5],
|
||||
_mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
|
||||
|
||||
/*
|
||||
ptrWordInOut[0] ^= State[11];
|
||||
ptrWordInOut[1] ^= State[0];
|
||||
ptrWordInOut[2] ^= State[1];
|
||||
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
|
||||
ptrWordIn += BLOCK_LEN_INT64;
|
||||
//Output: goes to previous column
|
||||
ptrWordOut -= BLOCK_LEN_INT64;
|
||||
|
||||
*/
|
||||
inout += BLOCK_LEN_M128I;
|
||||
in += BLOCK_LEN_M128I;
|
||||
out -= BLOCK_LEN_M128I;
|
||||
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
|
||||
_mm256_store_si256( (__m256i*)State + 2, state2 );
|
||||
_mm256_store_si256( (__m256i*)State + 3, state3 );
|
||||
|
||||
#elif defined(__SSE4_2__)
|
||||
#elif defined (__SSE2__)
|
||||
|
||||
__m128i* state = (__m128i*)State;
|
||||
__m128i* in = (__m128i*)rowIn;
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#define SPONGE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#define ALIGN __attribute__ ((aligned(32)))
|
||||
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
// returns void, updates all args
|
||||
#define G_4X64(a,b,c,d) \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
@@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
|
||||
#elif defined(__SSE4_2__)
|
||||
#elif defined(__SSE2__)
|
||||
|
||||
// process 2 columns in parallel
|
||||
// returns void, all args updated
|
||||
@@ -108,14 +108,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_ror256_1x64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_rol256_1x64( s6, s7 ); \
|
||||
mm128_ror1x64_256( s2, s3 ); \
|
||||
mm128_swap128_256( s4, s5 ); \
|
||||
mm128_rol1x64_256( s6, s7 ); \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rol256_1x64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_ror256_1x64( s6, s7 );
|
||||
mm128_rol1x64_256( s2, s3 ); \
|
||||
mm128_swap128_256( s4, s5 ); \
|
||||
mm128_ror1x64_256( s6, s7 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
#endif // AVX2 else SSE2
|
||||
|
||||
// Scalar
|
||||
//Blake2b's G function
|
||||
|
49
algo/m7m.c
49
algo/m7m.c
@@ -7,7 +7,6 @@
|
||||
#include <string.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
@@ -117,13 +116,8 @@ uint32_t sw2_(int nnounce)
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_CTX sha256;
|
||||
SHA512_CTX sha512;
|
||||
#else
|
||||
sph_sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
#endif
|
||||
sph_keccak512_context keccak;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -135,13 +129,8 @@ m7m_ctx_holder m7m_ctx;
|
||||
|
||||
void init_m7m_ctx()
|
||||
{
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &m7m_ctx.sha256 );
|
||||
SHA512_Init( &m7m_ctx.sha512 );
|
||||
#else
|
||||
sph_sha256_init( &m7m_ctx.sha256 );
|
||||
sph_sha512_init( &m7m_ctx.sha512 );
|
||||
#endif
|
||||
sph_keccak512_init( &m7m_ctx.keccak );
|
||||
sph_whirlpool_init( &m7m_ctx.whirlpool );
|
||||
sph_haval256_5_init( &m7m_ctx.haval );
|
||||
@@ -176,28 +165,18 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
|
||||
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_CTX ctxf_sha256;
|
||||
#else
|
||||
sph_sha256_context ctxf_sha256;
|
||||
#endif
|
||||
|
||||
memcpy(data, pdata, 80);
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
#else
|
||||
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
#endif
|
||||
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
|
||||
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
|
||||
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
|
||||
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
|
||||
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
|
||||
|
||||
// the following calculations can be performed once and the results shared
|
||||
mpz_t magipi, magisw, product, bns0, bns1;
|
||||
mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
|
||||
|
||||
@@ -222,22 +201,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
|
||||
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
|
||||
|
||||
// with 4 way can a single midstate be shared among lanes?
|
||||
// do sinlge round of midstate and inyerleave for final
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
|
||||
|
||||
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
|
||||
#else
|
||||
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha256_close( &ctx2.sha256, (void*)(bhash[0]) );
|
||||
|
||||
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha512_close( &ctx2.sha512, (void*)(bhash[1]) );
|
||||
#endif
|
||||
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
|
||||
|
||||
@@ -253,7 +221,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
|
||||
|
||||
// 4 way serial
|
||||
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
|
||||
mpz_set(bns1, bns0);
|
||||
mpz_set(product, bns0);
|
||||
@@ -269,17 +236,10 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
bytes = mpz_sizeinbase(product, 256);
|
||||
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &ctxf_sha256 );
|
||||
SHA256_Update( &ctxf_sha256, bdata, bytes );
|
||||
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
|
||||
#else
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
|
||||
#endif
|
||||
|
||||
// do once and share
|
||||
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
|
||||
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
|
||||
mpf_set_prec_raw(magifpi, prec);
|
||||
@@ -302,7 +262,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
mpz_set_f(magipi, magifpi);
|
||||
mpz_add(magipi,magipi,magisw);
|
||||
mpz_add(product,product,magipi);
|
||||
// share magipi, product and do serial
|
||||
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
|
||||
mpz_add(bns1, bns1, bns0);
|
||||
mpz_mul(product,product,bns1);
|
||||
@@ -312,18 +271,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
mpzscale=bytes;
|
||||
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &ctxf_sha256 );
|
||||
SHA256_Update( &ctxf_sha256, bdata, bytes );
|
||||
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
|
||||
#else
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
|
||||
#endif
|
||||
}
|
||||
|
||||
// this is the scanhash part
|
||||
const unsigned char *hash_ = (const unsigned char *)hash;
|
||||
const unsigned char *target_ = (const unsigned char *)ptarget;
|
||||
for ( i = 31; i >= 0; i-- )
|
||||
@@ -354,7 +306,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
// do this in hashm7m
|
||||
out:
|
||||
mpf_set_prec_raw(magifpi, prec0);
|
||||
mpf_set_prec_raw(magifpi0, prec0);
|
||||
|
334
algo/panama/sph_panama.c
Normal file
334
algo/panama/sph_panama.c
Normal file
@@ -0,0 +1,334 @@
|
||||
/* $Id: panama.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* PANAMA implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sph_panama.h"
|
||||
|
||||
#define LVAR17(b) sph_u32 \
|
||||
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
|
||||
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
|
||||
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
|
||||
|
||||
#define LVARS \
|
||||
LVAR17(a) \
|
||||
LVAR17(g) \
|
||||
LVAR17(p) \
|
||||
LVAR17(t)
|
||||
|
||||
#define M17(macro) do { \
|
||||
macro( 0, 1, 2, 4); \
|
||||
macro( 1, 2, 3, 5); \
|
||||
macro( 2, 3, 4, 6); \
|
||||
macro( 3, 4, 5, 7); \
|
||||
macro( 4, 5, 6, 8); \
|
||||
macro( 5, 6, 7, 9); \
|
||||
macro( 6, 7, 8, 10); \
|
||||
macro( 7, 8, 9, 11); \
|
||||
macro( 8, 9, 10, 12); \
|
||||
macro( 9, 10, 11, 13); \
|
||||
macro(10, 11, 12, 14); \
|
||||
macro(11, 12, 13, 15); \
|
||||
macro(12, 13, 14, 16); \
|
||||
macro(13, 14, 15, 0); \
|
||||
macro(14, 15, 16, 1); \
|
||||
macro(15, 16, 0, 2); \
|
||||
macro(16, 0, 1, 3); \
|
||||
} while (0)
|
||||
|
||||
#define BUPDATE1(n0, n2) do { \
|
||||
sc->buffer[ptr24][n0] ^= sc->buffer[ptr31][n2]; \
|
||||
sc->buffer[ptr31][n2] ^= INW1(n2); \
|
||||
} while (0)
|
||||
|
||||
#define BUPDATE do { \
|
||||
BUPDATE1(0, 2); \
|
||||
BUPDATE1(1, 3); \
|
||||
BUPDATE1(2, 4); \
|
||||
BUPDATE1(3, 5); \
|
||||
BUPDATE1(4, 6); \
|
||||
BUPDATE1(5, 7); \
|
||||
BUPDATE1(6, 0); \
|
||||
BUPDATE1(7, 1); \
|
||||
} while (0)
|
||||
|
||||
#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0])
|
||||
|
||||
#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0)
|
||||
|
||||
#define GAMMA(n0, n1, n2, n4) \
|
||||
(g ## n0 = a ## n0 ^ (a ## n1 | SPH_T32(~a ## n2)))
|
||||
|
||||
#define PI_ALL do { \
|
||||
p0 = g0; \
|
||||
p1 = SPH_ROTL32( g7, 1); \
|
||||
p2 = SPH_ROTL32(g14, 3); \
|
||||
p3 = SPH_ROTL32( g4, 6); \
|
||||
p4 = SPH_ROTL32(g11, 10); \
|
||||
p5 = SPH_ROTL32( g1, 15); \
|
||||
p6 = SPH_ROTL32( g8, 21); \
|
||||
p7 = SPH_ROTL32(g15, 28); \
|
||||
p8 = SPH_ROTL32( g5, 4); \
|
||||
p9 = SPH_ROTL32(g12, 13); \
|
||||
p10 = SPH_ROTL32( g2, 23); \
|
||||
p11 = SPH_ROTL32( g9, 2); \
|
||||
p12 = SPH_ROTL32(g16, 14); \
|
||||
p13 = SPH_ROTL32( g6, 27); \
|
||||
p14 = SPH_ROTL32(g13, 9); \
|
||||
p15 = SPH_ROTL32( g3, 24); \
|
||||
p16 = SPH_ROTL32(g10, 8); \
|
||||
} while (0)
|
||||
|
||||
#define THETA(n0, n1, n2, n4) \
|
||||
(t ## n0 = p ## n0 ^ p ## n1 ^ p ## n4)
|
||||
|
||||
#define SIGMA_ALL do { \
|
||||
a0 = t0 ^ 1; \
|
||||
a1 = t1 ^ INW2(0); \
|
||||
a2 = t2 ^ INW2(1); \
|
||||
a3 = t3 ^ INW2(2); \
|
||||
a4 = t4 ^ INW2(3); \
|
||||
a5 = t5 ^ INW2(4); \
|
||||
a6 = t6 ^ INW2(5); \
|
||||
a7 = t7 ^ INW2(6); \
|
||||
a8 = t8 ^ INW2(7); \
|
||||
a9 = t9 ^ sc->buffer[ptr16][0]; \
|
||||
a10 = t10 ^ sc->buffer[ptr16][1]; \
|
||||
a11 = t11 ^ sc->buffer[ptr16][2]; \
|
||||
a12 = t12 ^ sc->buffer[ptr16][3]; \
|
||||
a13 = t13 ^ sc->buffer[ptr16][4]; \
|
||||
a14 = t14 ^ sc->buffer[ptr16][5]; \
|
||||
a15 = t15 ^ sc->buffer[ptr16][6]; \
|
||||
a16 = t16 ^ sc->buffer[ptr16][7]; \
|
||||
} while (0)
|
||||
|
||||
#define PANAMA_STEP do { \
|
||||
unsigned ptr16, ptr24, ptr31; \
|
||||
\
|
||||
ptr24 = (ptr0 - 8) & 31; \
|
||||
ptr31 = (ptr0 - 1) & 31; \
|
||||
BUPDATE; \
|
||||
M17(GAMMA); \
|
||||
PI_ALL; \
|
||||
M17(THETA); \
|
||||
ptr16 = ptr0 ^ 16; \
|
||||
SIGMA_ALL; \
|
||||
ptr0 = ptr31; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* These macros are used to compute
|
||||
*/
|
||||
#define INC0 1
|
||||
#define INC1 2
|
||||
#define INC2 3
|
||||
#define INC3 4
|
||||
#define INC4 5
|
||||
#define INC5 6
|
||||
#define INC6 7
|
||||
#define INC7 8
|
||||
|
||||
/*
|
||||
* Push data by blocks of 32 bytes. "pbuf" must be 32-bit aligned. Each
|
||||
* iteration processes 32 data bytes; "num" contains the number of
|
||||
* iterations.
|
||||
*/
|
||||
static void
|
||||
panama_push(sph_panama_context *sc, const unsigned char *pbuf, size_t num)
|
||||
{
|
||||
LVARS
|
||||
unsigned ptr0;
|
||||
#if SPH_LITTLE_FAST
|
||||
#define INW1(i) sph_dec32le_aligned(pbuf + 4 * (i))
|
||||
#else
|
||||
sph_u32 X_var[8];
|
||||
#define INW1(i) X_var[i]
|
||||
#endif
|
||||
#define INW2(i) INW1(i)
|
||||
|
||||
M17(RSTATE);
|
||||
ptr0 = sc->buffer_ptr;
|
||||
while (num -- > 0) {
|
||||
#if !SPH_LITTLE_FAST
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 8; i ++)
|
||||
X_var[i] = sph_dec32le_aligned(pbuf + 4 * (i));
|
||||
#endif
|
||||
PANAMA_STEP;
|
||||
pbuf = (const unsigned char *)pbuf + 32;
|
||||
}
|
||||
M17(WSTATE);
|
||||
sc->buffer_ptr = ptr0;
|
||||
|
||||
#undef INW1
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the "pull" operation repeatedly ("num" times). The hash output
|
||||
* will be extracted from the state afterwards.
|
||||
*/
|
||||
static void
|
||||
panama_pull(sph_panama_context *sc, unsigned num)
|
||||
{
|
||||
LVARS
|
||||
unsigned ptr0;
|
||||
#define INW1(i) INW_H1(INC ## i)
|
||||
#define INW_H1(i) INW_H2(i)
|
||||
#define INW_H2(i) a ## i
|
||||
#define INW2(i) sc->buffer[ptr4][i]
|
||||
|
||||
M17(RSTATE);
|
||||
ptr0 = sc->buffer_ptr;
|
||||
while (num -- > 0) {
|
||||
unsigned ptr4;
|
||||
|
||||
ptr4 = (ptr0 + 4) & 31;
|
||||
PANAMA_STEP;
|
||||
}
|
||||
M17(WSTATE);
|
||||
|
||||
#undef INW1
|
||||
#undef INW_H1
|
||||
#undef INW_H2
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
/* see sph_panama.h */
|
||||
void
|
||||
sph_panama_init(void *cc)
|
||||
{
|
||||
sph_panama_context *sc;
|
||||
|
||||
sc = cc;
|
||||
/*
|
||||
* This is not completely conformant, but "it will work
|
||||
* everywhere". Initial state consists of zeroes everywhere.
|
||||
* Conceptually, the sph_u32 type may have padding bits which
|
||||
* must not be set to 0; but such an architecture remains to
|
||||
* be seen.
|
||||
*/
|
||||
sc->data_ptr = 0;
|
||||
memset(sc->buffer, 0, sizeof sc->buffer);
|
||||
sc->buffer_ptr = 0;
|
||||
memset(sc->state, 0, sizeof sc->state);
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
static void
|
||||
panama_short(void *cc, const void *data, size_t len)
|
||||
#else
|
||||
void
|
||||
sph_panama(void *cc, const void *data, size_t len)
|
||||
#endif
|
||||
{
|
||||
sph_panama_context *sc;
|
||||
unsigned current;
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
while (len > 0) {
|
||||
unsigned clen;
|
||||
|
||||
clen = (sizeof sc->data) - current;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(sc->data + current, data, clen);
|
||||
data = (const unsigned char *)data + clen;
|
||||
len -= clen;
|
||||
current += clen;
|
||||
if (current == sizeof sc->data) {
|
||||
current = 0;
|
||||
panama_push(sc, sc->data, 1);
|
||||
}
|
||||
}
|
||||
sc->data_ptr = current;
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
/* see sph_panama.h */
|
||||
void
|
||||
sph_panama(void *cc, const void *data, size_t len)
|
||||
{
|
||||
sph_panama_context *sc;
|
||||
unsigned current;
|
||||
size_t rlen;
|
||||
|
||||
if (len < (2 * sizeof sc->data)) {
|
||||
panama_short(cc, data, len);
|
||||
return;
|
||||
}
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
if (current > 0) {
|
||||
unsigned t;
|
||||
|
||||
t = (sizeof sc->data) - current;
|
||||
panama_short(sc, data, t);
|
||||
data = (const unsigned char *)data + t;
|
||||
len -= t;
|
||||
}
|
||||
#if !SPH_UNALIGNED
|
||||
if (((SPH_UPTR)data & 3) != 0) {
|
||||
panama_short(sc, data, len);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
panama_push(sc, data, len >> 5);
|
||||
rlen = len & 31;
|
||||
if (rlen > 0)
|
||||
memcpy(sc->data,
|
||||
(const unsigned char *)data + len - rlen, rlen);
|
||||
sc->data_ptr = rlen;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* see sph_panama.h */
|
||||
void
|
||||
sph_panama_close(void *cc, void *dst)
|
||||
{
|
||||
sph_panama_context *sc;
|
||||
unsigned current;
|
||||
int i;
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
sc->data[current ++] = 0x01;
|
||||
memset(sc->data + current, 0, (sizeof sc->data) - current);
|
||||
panama_push(sc, sc->data, 1);
|
||||
panama_pull(sc, 32);
|
||||
for (i = 0; i < 8; i ++)
|
||||
sph_enc32le((unsigned char *)dst + 4 * i, sc->state[i + 9]);
|
||||
sph_panama_init(sc);
|
||||
}
|
118
algo/panama/sph_panama.h
Normal file
118
algo/panama/sph_panama.h
Normal file
@@ -0,0 +1,118 @@
|
||||
/* $Id: sph_panama.h 154 2010-04-26 17:00:24Z tp $ */
|
||||
/**
|
||||
* PANAMA interface.
|
||||
*
|
||||
* PANAMA has been published in: J. Daemen and C. Clapp, "Fast Hashing
|
||||
* and Stream Encryption with PANAMA", Fast Software Encryption -
|
||||
* FSE'98, LNCS 1372, Springer (1998), pp. 60--74.
|
||||
*
|
||||
* PANAMA is not fully defined with regards to endianness and related
|
||||
* topics. This implementation follows strict little-endian conventions:
|
||||
* <ul>
|
||||
* <li>Each 32-byte input block is split into eight 32-bit words, the
|
||||
* first (leftmost) word being numbered 0.</li>
|
||||
* <li>Each such 32-bit word is decoded from memory in little-endian
|
||||
* convention.</li>
|
||||
* <li>The additional padding bit equal to "1" is added by considering
|
||||
* the least significant bit in a byte to come first; practically, this
|
||||
* means that a single byte of value 0x01 is appended to the (byte-oriented)
|
||||
* message, and then 0 to 31 bytes of value 0x00.</li>
|
||||
* <li>The output consists of eight 32-bit words; the word numbered 0 is
|
||||
* written first (in leftmost position) and it is encoded in little-endian
|
||||
* convention.
|
||||
* </ul>
|
||||
* With these conventions, PANAMA is sometimes known as "PANAMA-LE". The
|
||||
* PANAMA reference implementation uses our conventions for input, but
|
||||
* prescribes no convention for output.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_panama.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_PANAMA_H__
|
||||
#define SPH_PANAMA_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for PANAMA.
|
||||
*/
|
||||
#define SPH_SIZE_panama 256
|
||||
|
||||
/**
|
||||
* This structure is a context for PANAMA computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once
|
||||
* a PANAMA computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running PANAMA computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char data[32]; /* first field, for alignment */
|
||||
unsigned data_ptr;
|
||||
|
||||
sph_u32 buffer[32][8];
|
||||
unsigned buffer_ptr;
|
||||
|
||||
sph_u32 state[17];
|
||||
#endif
|
||||
} sph_panama_context;
|
||||
|
||||
/**
|
||||
* Initialize a PANAMA context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the PANAMA context (pointer to a <code>sph_panama_context</code>)
|
||||
*/
|
||||
void sph_panama_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the PANAMA context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_panama(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current PANAMA computation and output the result into the
|
||||
* provided buffer. The destination buffer must be wide enough to
|
||||
* accomodate the result (32 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the PANAMA context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_panama_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
@@ -48,36 +48,36 @@ void anime_4way_hash( void *state, const void *input )
|
||||
__m256i* vhA = (__m256i*)vhashA;
|
||||
__m256i* vhB = (__m256i*)vhashB;
|
||||
__m256i vh_mask;
|
||||
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
|
||||
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
|
||||
int i;
|
||||
anime_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
|
||||
|
||||
bmw512_4way( &ctx.bmw, vhash, 80 );
|
||||
bmw512_4way( &ctx.bmw, input, 80 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
blake512_4way( &ctx.blake, input, 64 );
|
||||
blake512_4way( &ctx.blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
|
||||
m256_zero );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
@@ -120,13 +120,13 @@ void anime_4way_hash( void *state, const void *input )
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
|
||||
m256_zero );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhashB );
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhashB );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
1003
algo/radiogatun/sph_radiogatun.c
Normal file
1003
algo/radiogatun/sph_radiogatun.c
Normal file
File diff suppressed because it is too large
Load Diff
186
algo/radiogatun/sph_radiogatun.h
Normal file
186
algo/radiogatun/sph_radiogatun.h
Normal file
@@ -0,0 +1,186 @@
|
||||
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
|
||||
/**
|
||||
* RadioGatun interface.
|
||||
*
|
||||
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
|
||||
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
|
||||
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
|
||||
* August 24-25, 2006. The main Web site, containing that article, the
|
||||
* reference code and some test vectors, appears to be currently located
|
||||
* at the following URL: http://radiogatun.noekeon.org/
|
||||
*
|
||||
* The presentation article does not specify endianness or padding. The
|
||||
* reference code uses the following conventions, which we also apply
|
||||
* here:
|
||||
* <ul>
|
||||
* <li>The input message is an integral number of sequences of three
|
||||
* words. Each word is either a 32-bit of 64-bit word (depending on
|
||||
* the version of RadioGatun).</li>
|
||||
* <li>Input bytes are decoded into words using little-endian
|
||||
* convention.</li>
|
||||
* <li>Padding consists of a single bit of value 1, using little-endian
|
||||
* convention within bytes (i.e. for a byte-oriented input, a single
|
||||
* byte of value 0x01 is appended), then enough bits of value 0 to finish
|
||||
* the current block.</li>
|
||||
* <li>Output consists of 256 bits. Successive output words are encoded
|
||||
* with little-endian convention.</li>
|
||||
* </ul>
|
||||
* These conventions are very close to those we use for PANAMA, which is
|
||||
* a close ancestor or RadioGatun.
|
||||
*
|
||||
* RadioGatun is actually a family of functions, depending on some
|
||||
* internal parameters. We implement here two functions, with a "belt
|
||||
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
|
||||
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
|
||||
* variant uses 64-bit words.
|
||||
*
|
||||
* Strictly speaking, the name "RadioGatun" should use an acute accent
|
||||
* on the "u", which we omitted here to keep strict ASCII-compatibility
|
||||
* of this file.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_radiogatun.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_RADIOGATUN_H__
|
||||
#define SPH_RADIOGATUN_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RadioGatun[32].
|
||||
*/
|
||||
#define SPH_SIZE_radiogatun32 256
|
||||
|
||||
/**
|
||||
* This structure is a context for RadioGatun[32] computations: it
|
||||
* contains intermediate values and some data from the last entered
|
||||
* block. Once a RadioGatun[32] computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RadioGatun[32]
|
||||
* computation can be cloned by copying the context (e.g. with a
|
||||
* simple <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char data[156]; /* first field, for alignment */
|
||||
unsigned data_ptr;
|
||||
sph_u32 a[19], b[39];
|
||||
#endif
|
||||
} sph_radiogatun32_context;
|
||||
|
||||
/**
|
||||
* Initialize a RadioGatun[32] context. This process performs no
|
||||
* memory allocation.
|
||||
*
|
||||
* @param cc the RadioGatun[32] context (pointer to a
|
||||
* <code>sph_radiogatun32_context</code>)
|
||||
*/
|
||||
void sph_radiogatun32_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RadioGatun[32] context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_radiogatun32(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RadioGatun[32] computation and output the
|
||||
* result into the provided buffer. The destination buffer must be wide
|
||||
* enough to accomodate the result (32 bytes). The context is
|
||||
* automatically reinitialized.
|
||||
*
|
||||
* @param cc the RadioGatun[32] context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_radiogatun32_close(void *cc, void *dst);
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RadioGatun[64].
|
||||
*/
|
||||
#define SPH_SIZE_radiogatun64 256
|
||||
|
||||
/**
|
||||
* This structure is a context for RadioGatun[64] computations: it
|
||||
* contains intermediate values and some data from the last entered
|
||||
* block. Once a RadioGatun[64] computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RadioGatun[64]
|
||||
* computation can be cloned by copying the context (e.g. with a
|
||||
* simple <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char data[312]; /* first field, for alignment */
|
||||
unsigned data_ptr;
|
||||
sph_u64 a[19], b[39];
|
||||
#endif
|
||||
} sph_radiogatun64_context;
|
||||
|
||||
/**
|
||||
* Initialize a RadioGatun[64] context. This process performs no
|
||||
* memory allocation.
|
||||
*
|
||||
* @param cc the RadioGatun[64] context (pointer to a
|
||||
* <code>sph_radiogatun64_context</code>)
|
||||
*/
|
||||
void sph_radiogatun64_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RadioGatun[64] context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_radiogatun64(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RadioGatun[64] computation and output the
|
||||
* result into the provided buffer. The destination buffer must be wide
|
||||
* enough to accomodate the result (32 bytes). The context is
|
||||
* automatically reinitialized.
|
||||
*
|
||||
* @param cc the RadioGatun[64] context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_radiogatun64_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -4,10 +4,13 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
// Overide multi way on ryzen, SHA is better.
|
||||
#if !defined(RYZEN_)
|
||||
// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
|
||||
#if defined(__AVX2__)
|
||||
#define LBRY_8WAY
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define LBRY_NTIME_INDEX 25
|
||||
#define LBRY_NBITS_INDEX 26
|
||||
|
@@ -4,24 +4,17 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sph_ripemd.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
void lbry_hash(void* output, const void* input)
|
||||
{
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
|
||||
SHA512_CTX ctx_sha512 __attribute__ ((aligned (64)));
|
||||
#else
|
||||
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
||||
#endif
|
||||
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) hashA[16];
|
||||
uint32_t _ALIGN(64) hashB[16];
|
||||
uint32_t _ALIGN(64) hashC[16];
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, input, 112 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
@@ -33,19 +26,6 @@ void lbry_hash(void* output, const void* input)
|
||||
SHA512_Init( &ctx_sha512 );
|
||||
SHA512_Update( &ctx_sha512, hashA, 32 );
|
||||
SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
|
||||
#else
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256 ( &ctx_sha256, input, 112 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256 ( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha512_init( &ctx_sha512 );
|
||||
sph_sha512 ( &ctx_sha512, hashA, 32 );
|
||||
sph_sha512_close( &ctx_sha512, hashA );
|
||||
#endif
|
||||
|
||||
sph_ripemd160_init( &ctx_ripemd );
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
|
||||
@@ -55,7 +35,6 @@ void lbry_hash(void* output, const void* input)
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
||||
sph_ripemd160_close( &ctx_ripemd, hashC );
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, hashB, 20 );
|
||||
SHA256_Update( &ctx_sha256, hashC, 20 );
|
||||
@@ -64,16 +43,7 @@ void lbry_hash(void* output, const void* input)
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, hashA, 32 );
|
||||
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
|
||||
#else
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256 ( &ctx_sha256, hashB, 20 );
|
||||
sph_sha256 ( &ctx_sha256, hashC, 20 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256 ( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
#endif
|
||||
memcpy( output, hashA, 32 );
|
||||
}
|
||||
|
||||
|
@@ -6,7 +6,7 @@
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@@ -296,6 +296,7 @@ get_xgetbv(uint32_t flags) {
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
@@ -354,6 +355,7 @@ detect_cpu(void) {
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
|
@@ -30,7 +30,7 @@
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
@@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
}
|
||||
|
||||
#endif // __AVX2__
|
||||
#endif // __SSE4_2__
|
||||
#endif // __SSE2__
|
||||
|
@@ -42,9 +42,10 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
//#if defined(__SSE4_2__)
|
||||
|
||||
//#define SPH_SIZE_sha256 256
|
||||
|
||||
@@ -60,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
|
||||
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
|
||||
/*
|
||||
// SHA-256 7 way hybrid
|
||||
// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel.
|
||||
typedef struct {
|
||||
__m128i bufx[64>>2];
|
||||
__m128i valx[8];
|
||||
__m64 bufy[64>>2];
|
||||
__m64 valy[8];
|
||||
uint32_t bufz[64>>2];
|
||||
uint32_t valz[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_7way_context;
|
||||
|
||||
void sha256_7way_init( sha256_7way_context *ctx );
|
||||
void sha256_7way( sha256_7way_context *ctx, const void *datax,
|
||||
void *datay, void *dataz, size_t len );
|
||||
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
|
||||
void *dstz );
|
||||
*/
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-256 8 way
|
||||
@@ -88,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
|
||||
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
|
||||
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
// SHA-256 11 way hybrid
|
||||
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
|
||||
typedef struct {
|
||||
__m256i bufx[64>>2];
|
||||
__m256i valx[8];
|
||||
__m64 bufy[64>>2];
|
||||
__m64 valy[8];
|
||||
uint32_t bufz[64>>2];
|
||||
uint32_t valz[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_11way_context;
|
||||
|
||||
void sha256_11way_init( sha256_11way_context *ctx );
|
||||
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
|
||||
const void *datay, const void *dataz, size_t len );
|
||||
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
|
||||
void *dstz );
|
||||
|
||||
#endif // __AVX2__
|
||||
#endif // __SSE2__
|
||||
#endif // SHA256_4WAY_H__
|
||||
|
538
algo/sha/sha256_hash_11way.c
Normal file
538
algo/sha/sha256_hash_11way.c
Normal file
@@ -0,0 +1,538 @@
|
||||
#if 0
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sha2-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// naming convention for variables and macros
|
||||
// VARx: AVX2 8 way 32 bit
|
||||
// VARy: MMX 2 way 32 bit
|
||||
// VARz: scalar integer 32 bit
|
||||
|
||||
|
||||
static const uint32_t H256[8] =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
static const uint32_t K256[64] =
|
||||
{
|
||||
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
|
||||
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
|
||||
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
|
||||
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
|
||||
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
|
||||
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
|
||||
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
|
||||
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
|
||||
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
|
||||
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
|
||||
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
|
||||
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
|
||||
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
|
||||
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
|
||||
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
|
||||
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
|
||||
};
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
#define CHy(X, Y, Z) \
|
||||
_mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
|
||||
|
||||
#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
|
||||
|
||||
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
|
||||
#define MAJy(X, Y, Z) \
|
||||
_mm_or_si64( _mm_and_si64( X, Y ), \
|
||||
_mm_and_si64( _mm_or_si64( X, Y ), Z ) )
|
||||
|
||||
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
|
||||
|
||||
#define BSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
|
||||
|
||||
#define BSG2_0y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
|
||||
|
||||
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
|
||||
|
||||
#define BSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
|
||||
|
||||
#define BSG2_1y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
|
||||
|
||||
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
|
||||
|
||||
#define SSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )
|
||||
|
||||
#define SSG2_0y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
|
||||
|
||||
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
|
||||
|
||||
#define SSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
|
||||
|
||||
#define SSG2_1y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
|
||||
|
||||
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
|
||||
|
||||
#define SHA2x_MEXP( a, b, c, d ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
|
||||
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
|
||||
|
||||
#define SHA2y_MEXP( a, b, c, d ) \
|
||||
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
|
||||
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
|
||||
|
||||
#define SHA2z_MEXP( a, b, c, d ) \
|
||||
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
|
||||
|
||||
|
||||
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
|
||||
do { \
|
||||
__m256i T1x, T2x; \
|
||||
__m64 T1y, T2y; \
|
||||
uint32_t T1z, T2z; \
|
||||
T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
|
||||
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
|
||||
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
|
||||
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
|
||||
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
|
||||
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
|
||||
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
|
||||
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
|
||||
T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
|
||||
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
|
||||
Dx = _mm256_add_epi32( Dx, T1x ); \
|
||||
Dy = _mm_add_pi32( Dy, T1y ); \
|
||||
Dz = Dz + T1z; \
|
||||
Hx = _mm256_add_epi32( T1x, T2x ); \
|
||||
Hy = _mm_add_pi32( T1y, T2y ); \
|
||||
Hz = T1z + T2z; \
|
||||
} while (0)
|
||||
|
||||
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
|
||||
uint32_t *inz, uint32_t rz[8] )
|
||||
{
|
||||
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
|
||||
__m256i Wx[16];
|
||||
__m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
|
||||
__m64 Wy[16];
|
||||
uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
|
||||
uint32_t Wz[16];
|
||||
|
||||
Wx[ 0] = mm256_bswap_32( inx[ 0] );
|
||||
Wy[ 0] = mm64_bswap_32( iny[ 0] );
|
||||
Wz[ 0] = bswap_32( inz[ 0] );
|
||||
|
||||
Wx[ 1] = mm256_bswap_32( inx[ 1] );
|
||||
Wy[ 1] = mm64_bswap_32( iny[ 1] );
|
||||
Wz[ 1] = bswap_32( inz[ 1] );
|
||||
|
||||
Wx[ 2] = mm256_bswap_32( inx[ 2] );
|
||||
Wy[ 2] = mm64_bswap_32( iny[ 2] );
|
||||
Wz[ 2] = bswap_32( inz[ 2] );
|
||||
|
||||
Wx[ 3] = mm256_bswap_32( inx[ 3] );
|
||||
Wy[ 3] = mm64_bswap_32( iny[ 3] );
|
||||
Wz[ 3] = bswap_32( inz[ 3] );
|
||||
|
||||
Wx[ 4] = mm256_bswap_32( inx[ 4] );
|
||||
Wy[ 4] = mm64_bswap_32( iny[ 4] );
|
||||
Wz[ 4] = bswap_32( inz[ 4] );
|
||||
|
||||
Wx[ 5] = mm256_bswap_32( inx[ 5] );
|
||||
Wy[ 5] = mm64_bswap_32( iny[ 5] );
|
||||
Wz[ 5] = bswap_32( inz[ 5] );
|
||||
|
||||
Wx[ 6] = mm256_bswap_32( inx[ 6] );
|
||||
Wy[ 6] = mm64_bswap_32( iny[ 6] );
|
||||
Wz[ 6] = bswap_32( inz[ 6] );
|
||||
|
||||
Wx[ 7] = mm256_bswap_32( inx[ 7] );
|
||||
Wy[ 7] = mm64_bswap_32( iny[ 7] );
|
||||
Wz[ 7] = bswap_32( inz[ 7] );
|
||||
|
||||
Wx[ 8] = mm256_bswap_32( inx[ 8] );
|
||||
Wy[ 8] = mm64_bswap_32( iny[ 8] );
|
||||
Wz[ 8] = bswap_32( inz[ 8] );
|
||||
|
||||
Wx[ 9] = mm256_bswap_32( inx[ 9] );
|
||||
Wy[ 9] = mm64_bswap_32( iny[ 9] );
|
||||
Wz[ 9] = bswap_32( inz[ 9] );
|
||||
|
||||
Wx[10] = mm256_bswap_32( inx[10] );
|
||||
Wy[10] = mm64_bswap_32( iny[10] );
|
||||
Wz[10] = bswap_32( inz[10] );
|
||||
|
||||
Wx[11] = mm256_bswap_32( inx[11] );
|
||||
Wy[11] = mm64_bswap_32( iny[11] );
|
||||
Wz[11] = bswap_32( inz[11] );
|
||||
|
||||
Wx[12] = mm256_bswap_32( inx[12] );
|
||||
Wy[12] = mm64_bswap_32( iny[12] );
|
||||
Wz[12] = bswap_32( inz[12] );
|
||||
|
||||
Wx[13] = mm256_bswap_32( inx[13] );
|
||||
Wy[13] = mm64_bswap_32( iny[13] );
|
||||
Wz[13] = bswap_32( inz[13] );
|
||||
|
||||
Wx[14] = mm256_bswap_32( inx[14] );
|
||||
Wy[14] = mm64_bswap_32( iny[14] );
|
||||
Wz[14] = bswap_32( inz[14] );
|
||||
|
||||
Wx[15] = mm256_bswap_32( inx[15] );
|
||||
Wy[15] = mm64_bswap_32( iny[15] );
|
||||
Wz[15] = bswap_32( inz[15] );
|
||||
|
||||
Ax = rx[0]; Ay = ry[0]; Az = rz[0];
|
||||
Bx = rx[1]; By = ry[1]; Bz = rz[1];
|
||||
Cx = rx[2]; Cy = ry[2]; Cz = rz[2];
|
||||
Dx = rx[3]; Dy = ry[3]; Dz = rz[3];
|
||||
Ex = rx[4]; Ey = ry[4]; Ez = rz[4];
|
||||
Fx = rx[5]; Fy = ry[5]; Fz = rz[5];
|
||||
Gx = rx[6]; Gy = ry[6]; Gz = rz[6];
|
||||
Hx = rx[7]; Hy = ry[7]; Hz = rz[7];
|
||||
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 );
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
|
||||
Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 );
|
||||
Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 );
|
||||
|
||||
Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
|
||||
Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 );
|
||||
Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 );
|
||||
|
||||
Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
|
||||
Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 );
|
||||
Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 );
|
||||
|
||||
Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
|
||||
Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 );
|
||||
Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 );
|
||||
|
||||
Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
|
||||
Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 );
|
||||
Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 );
|
||||
|
||||
Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
|
||||
Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 );
|
||||
Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 );
|
||||
|
||||
Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
|
||||
Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 );
|
||||
Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 );
|
||||
|
||||
Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7);
|
||||
Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7);
|
||||
Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7);
|
||||
|
||||
Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8);
|
||||
Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8);
|
||||
Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8);
|
||||
|
||||
Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
|
||||
Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9);
|
||||
Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9);
|
||||
|
||||
Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 );
|
||||
Wy[10] = SHA2y_MEXP( 8, 3, 11, 10);
|
||||
Wz[10] = SHA2z_MEXP( 8, 3, 11, 10);
|
||||
|
||||
Wx[11] = SHA2x_MEXP( 9, 4, 12, 11);
|
||||
Wy[11] = SHA2y_MEXP( 9, 4, 12, 11);
|
||||
Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 );
|
||||
|
||||
Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 );
|
||||
Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 );
|
||||
Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 );
|
||||
|
||||
Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 );
|
||||
Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 );
|
||||
Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 );
|
||||
|
||||
Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 );
|
||||
Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 );
|
||||
Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 );
|
||||
|
||||
Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 );
|
||||
Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 );
|
||||
Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
|
||||
}
|
||||
|
||||
rx[0] = _mm256_add_epi32( rx[0], Ax );
|
||||
ry[0] = _mm_add_pi32( ry[0], Ay );
|
||||
rz[0] = rz[0]+ Az;
|
||||
rx[1] = _mm256_add_epi32( rx[1], Bx );
|
||||
ry[1] = _mm_add_pi32( ry[1], By );
|
||||
rz[1] = rz[1]+ Bz;
|
||||
rx[2] = _mm256_add_epi32( rx[2], Cx );
|
||||
ry[2] = _mm_add_pi32( ry[2], Cy );
|
||||
rz[3] = rz[3]+ Dz;
|
||||
rx[4] = _mm256_add_epi32( rx[4], Ex );
|
||||
ry[4] = _mm_add_pi32( ry[4], Ey );
|
||||
rz[4] = rz[4]+ Ez;
|
||||
rx[5] = _mm256_add_epi32( rx[5], Fx );
|
||||
ry[5] = _mm_add_pi32( ry[5], Fy );
|
||||
rz[5] = rz[5]+ Fz;
|
||||
rx[6] = _mm256_add_epi32( rx[6], Gx );
|
||||
ry[6] = _mm_add_pi32( ry[6], Gy );
|
||||
rz[6] = rz[6]+ Gz;
|
||||
rx[7] = _mm256_add_epi32( rx[7], Hx );
|
||||
ry[7] = _mm_add_pi32( ry[7], Hy );
|
||||
rz[7] = rz[7]+ Hz;
|
||||
|
||||
}
|
||||
|
||||
void sha256_11way_init( sha256_11way_context *ctx )
|
||||
{
|
||||
ctx->count_high = ctx->count_low = 0;
|
||||
ctx->valx[0] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[0] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[1] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[1] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[2] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[2] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[3] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[3] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[4] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[4] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[5] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[5] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[6] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[6] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[7] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[7] = _mm_set1_pi32( H256[0] );
|
||||
memcpy( ctx->valz, H256, 32 );
|
||||
}
|
||||
|
||||
|
||||
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
|
||||
const void *datay, const void *dataz, size_t len )
|
||||
{
|
||||
__m256i *vdatax = (__m256i*) datax;
|
||||
__m64 *vdatay = (__m64*) datay;
|
||||
uint32_t *idataz = (uint32_t*)dataz;
|
||||
size_t ptr;
|
||||
const int buf_size = 64;
|
||||
|
||||
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
|
||||
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
|
||||
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
ptr = 0;
|
||||
}
|
||||
clow = ctx->count_low;
|
||||
clow2 = clow + clen;
|
||||
ctx->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
ctx->count_high++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
|
||||
void *dstz)
|
||||
{
|
||||
unsigned ptr, u;
|
||||
uint32_t low, high;
|
||||
const int buf_size = 64;
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
|
||||
ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
|
||||
ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
|
||||
ctx->bufz[ ptr>>2 ] = 0x80;
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
{
|
||||
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
memset_zero_256( ctx->bufx, pad >> 2 );
|
||||
memset_zero_m64( ctx->bufy, pad >> 2 );
|
||||
memset( ctx->bufz, 0, pad >> 2 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
|
||||
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
|
||||
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
|
||||
}
|
||||
|
||||
low = ctx->count_low;
|
||||
high = (ctx->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
ctx->bufx[ pad >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( high ) );
|
||||
ctx->bufy[ pad >> 2 ] =
|
||||
mm64_bswap_32( _mm_set1_pi32( high ) );
|
||||
ctx->bufz[ pad >> 2 ] =
|
||||
bswap_32( high );
|
||||
|
||||
|
||||
ctx->bufx[ ( pad+4 ) >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( low ) );
|
||||
ctx->bufy[ ( pad+4 ) >> 2 ] =
|
||||
mm64_bswap_32( _mm_set1_pi32( low ) );
|
||||
ctx->bufz[ ( pad+4 ) >> 2 ] =
|
||||
bswap_32( low );
|
||||
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
|
||||
for ( u = 0; u < 8; u ++ )
|
||||
{
|
||||
casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
|
||||
casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
|
||||
((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // 0
|
188
algo/sha/sha256q-4way.c
Normal file
188
algo/sha/sha256q-4way.c
Normal file
@@ -0,0 +1,188 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha2-hash-4way.h"
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256q_8way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
sha256_8way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
|
||||
|
||||
sha256_8way( &ctx, input + (64<<3), 16 );
|
||||
sha256_8way_close( &ctx, vhash );
|
||||
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, vhash );
|
||||
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, vhash );
|
||||
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
// Need big endian data
|
||||
mm256_bswap_intrlv80_8x32( vdata, pdata );
|
||||
sha256_8way_init( &sha256_ctx8 );
|
||||
sha256_8way( &sha256_ctx8, vdata, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
*noncev = mm256_bswap_32(
|
||||
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
|
||||
|
||||
pdata[19] = n;
|
||||
sha256q_8way_hash( hash, vdata );
|
||||
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
uint32_t lane_hash[8];
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256q_4way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
sha256_4way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
|
||||
|
||||
sha256_4way( &ctx, input + (64<<2), 16 );
|
||||
sha256_4way_close( &ctx, vhash );
|
||||
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, vhash );
|
||||
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, vhash );
|
||||
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
sha256_4way_init( &sha256_ctx4 );
|
||||
sha256_4way( &sha256_ctx4, vdata, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
sha256q_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
113
algo/sha/sha256q.c
Normal file
113
algo/sha/sha256q.c
Normal file
@@ -0,0 +1,113 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256q_midstate( const void* input )
|
||||
{
|
||||
SHA256_Init( &sha256q_ctx );
|
||||
SHA256_Update( &sha256q_ctx, input, 64 );
|
||||
}
|
||||
|
||||
void sha256q_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[16];
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
SHA256_CTX ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
|
||||
|
||||
SHA256_Update( &ctx, input + midlen, tail );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, hash, 32 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, hash, 32 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, hash, 32 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx );
|
||||
|
||||
memcpy( output, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
#ifdef _MSC_VER
|
||||
uint32_t __declspec(align(32)) hash64[8];
|
||||
#else
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
#endif
|
||||
uint32_t endiandata[32];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = {
|
||||
0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000
|
||||
};
|
||||
uint32_t masks[] = {
|
||||
0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0
|
||||
};
|
||||
|
||||
// we need bigendian data...
|
||||
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
sha256q_midstate( endiandata );
|
||||
|
||||
for ( int m = 0; m < 6; m++ )
|
||||
{
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
sha256q_hash( hash64, endiandata );
|
||||
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash64 );
|
||||
if ( submit_work( mythr, work ) )
|
||||
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
thr_id );
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
}
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
@@ -5,6 +5,137 @@
|
||||
#include <stdio.h>
|
||||
#include "sha2-hash-4way.h"
|
||||
|
||||
#if defined(SHA256T_11WAY)
|
||||
|
||||
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
|
||||
const void *inpy, const void*inpz )
|
||||
{
|
||||
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
|
||||
uint32_t hashz[8] __attribute__ ((aligned (64)));
|
||||
sha256_11way_context ctx;
|
||||
const void *inpx64 = inpx+(64<<3);
|
||||
const void *inpy64 = inpy+(64<<1);
|
||||
const void *inpz64 = inpz+ 64;
|
||||
|
||||
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
|
||||
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
|
||||
sha256_11way_close( &ctx, hashx, hashy, hashz );
|
||||
|
||||
sha256_11way_init( &ctx );
|
||||
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
|
||||
sha256_11way_close( &ctx, hashx, hashy, hashz );
|
||||
|
||||
sha256_11way_init( &ctx );
|
||||
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
|
||||
sha256_11way_close( &ctx, outx, outy, outz );
|
||||
}
|
||||
|
||||
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t datax[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t datay[20*2] __attribute__ ((aligned (32)));
|
||||
uint32_t dataz[20] __attribute__ ((aligned (32)));
|
||||
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
|
||||
uint32_t hashz[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncex = (__m256i*) datax + 19;
|
||||
__m64 *noncey = (__m64*) datay + 19;
|
||||
uint32_t *noncez = (uint32_t*)dataz + 19;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int i;
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
// Use dataz (scalar) to stage bswapped data for the vectors.
|
||||
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
|
||||
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
|
||||
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm256_intrlv_8x32( datax, dataz, dataz, dataz, dataz,
|
||||
dataz, dataz, dataz, dataz, 640 );
|
||||
mm64_interleave_2x32( datay, dataz, dataz, 640 );
|
||||
|
||||
sha256_11way_init( &sha256_ctx11 );
|
||||
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
*noncex = mm256_bswap_32(
|
||||
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
|
||||
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
|
||||
*noncez = bswap_32( n+10 );
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
|
||||
|
||||
if ( opt_benchmark ) { n += 11; continue; }
|
||||
|
||||
hash7 = &(hashx[7<<3]);
|
||||
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + i;
|
||||
submit_solution( work, lane_hash, mythr, i );
|
||||
}
|
||||
}
|
||||
|
||||
hash7 = &(hashy[7<<1]);
|
||||
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
|
||||
|
||||
{
|
||||
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + 8 + i;
|
||||
submit_solution( work, lane_hash, mythr, i+8 );
|
||||
}
|
||||
}
|
||||
|
||||
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
|
||||
{
|
||||
pdata[19] = n+10;
|
||||
submit_solution( work, hashz, mythr, 10 );
|
||||
}
|
||||
n += 11;
|
||||
|
||||
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
||||
@@ -25,23 +156,22 @@ void sha256t_8way_hash( void* output, const void* input )
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, output );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_sha256t_8way( int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done )
|
||||
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (32)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 152; // 19*8
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
@@ -56,58 +186,42 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
for ( int k = 0; k < 20; k++ )
|
||||
be32enc( &edata[k], pdata[k] );
|
||||
|
||||
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
// Need big endian data
|
||||
mm256_bswap_intrlv80_8x32( vdata, pdata );
|
||||
sha256_8way_init( &sha256_ctx8 );
|
||||
sha256_8way( &sha256_ctx8, vdata, 64 );
|
||||
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
be32enc( noncep +4, n+4 );
|
||||
be32enc( noncep +5, n+5 );
|
||||
be32enc( noncep +6, n+6 );
|
||||
be32enc( noncep +7, n+7 );
|
||||
do
|
||||
{
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32(
|
||||
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
sha256t_8way_hash( hash, vdata );
|
||||
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
uint32_t lane_hash[8];
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
nonces[ num_found++ ] = n + lane;
|
||||
work_set_target_ratio( work, lane_hash );
|
||||
}
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256T_4WAY)
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
|
||||
|
||||
@@ -127,25 +241,22 @@ void sha256t_4way_hash( void* output, const void* input )
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, output );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_sha256t_4way( int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done )
|
||||
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t edata[20] __attribute__ ((aligned (32)));;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
@@ -153,17 +264,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &edata[k], pdata[k] );
|
||||
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap_intrlv80_4x32( vdata, pdata );
|
||||
sha256_4way_init( &sha256_ctx4 );
|
||||
sha256_4way( &sha256_ctx4, vdata, 64 );
|
||||
|
||||
@@ -171,10 +279,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
sha256t_4way_hash( hash, vdata );
|
||||
@@ -183,24 +288,18 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
nonces[ num_found++ ] = n + lane;
|
||||
work_set_target_ratio( work, lane_hash );
|
||||
}
|
||||
}
|
||||
|
||||
n += 4;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -3,15 +3,15 @@
|
||||
bool register_sha256t_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(SHA256T_8WAY)
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
gate->hash = (void*)&sha256t_8way_hash;
|
||||
#elif defined(SHA256T_4WAY)
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_4way;
|
||||
gate->hash = (void*)&sha256t_4way_hash;
|
||||
#else
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->hash = (void*)&sha256t_hash;
|
||||
#endif
|
||||
@@ -19,3 +19,19 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_sha256q_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(SHA256T_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q_4way;
|
||||
gate->hash = (void*)&sha256q_4way_hash;
|
||||
#else
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q;
|
||||
gate->hash = (void*)&sha256q_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
@@ -4,33 +4,45 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
// Override multi way on ryzen, SHA is better.
|
||||
#if !defined(RYZEN_)
|
||||
#if defined(__SSE2__)
|
||||
#define SHA256T_4WAY
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define SHA256T_8WAY
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
bool register_sha256t_algo( algo_gate_t* gate );
|
||||
bool register_sha256q_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
void sha256t_8way_hash( void *output, const void *input );
|
||||
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_8way_hash( void *output, const void *input );
|
||||
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#elif defined (SHA256T_4WAY)
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
void sha256t_4way_hash( void *output, const void *input );
|
||||
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
#else
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_4way_hash( void *output, const void *input );
|
||||
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void sha256t_hash( void *output, const void *input );
|
||||
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
#endif
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_hash( void *output, const void *input );
|
||||
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -5,8 +5,6 @@
|
||||
#include <stdio.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#if !defined(SHA256T_4WAY)
|
||||
|
||||
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_midstate( const void* input )
|
||||
@@ -39,7 +37,7 @@ void sha256t_hash( void* output, const void* input )
|
||||
}
|
||||
|
||||
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -52,6 +50,7 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
#endif
|
||||
uint32_t endiandata[32];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = {
|
||||
0,
|
||||
@@ -71,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
};
|
||||
|
||||
// we need bigendian data...
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
sha256t_midstate( endiandata );
|
||||
|
||||
@@ -88,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
if ( submit_work( mythr, work ) )
|
||||
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
thr_id );
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
}
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
break;
|
||||
@@ -99,4 +107,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
@@ -248,22 +248,22 @@ do { \
|
||||
*/
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
mm128_swap256_128( B0, C0 ); \
|
||||
mm128_swap256_128( B1, C1 ); \
|
||||
mm128_swap256_128( B2, C2 ); \
|
||||
mm128_swap256_128( B3, C3 ); \
|
||||
mm128_swap256_128( B4, C4 ); \
|
||||
mm128_swap256_128( B5, C5 ); \
|
||||
mm128_swap256_128( B6, C6 ); \
|
||||
mm128_swap256_128( B7, C7 ); \
|
||||
mm128_swap256_128( B8, C8 ); \
|
||||
mm128_swap256_128( B9, C9 ); \
|
||||
mm128_swap256_128( BA, CA ); \
|
||||
mm128_swap256_128( BB, CB ); \
|
||||
mm128_swap256_128( BC, CC ); \
|
||||
mm128_swap256_128( BD, CD ); \
|
||||
mm128_swap256_128( BE, CE ); \
|
||||
mm128_swap256_128( BF, CF ); \
|
||||
mm128_swap128_256( B0, C0 ); \
|
||||
mm128_swap128_256( B1, C1 ); \
|
||||
mm128_swap128_256( B2, C2 ); \
|
||||
mm128_swap128_256( B3, C3 ); \
|
||||
mm128_swap128_256( B4, C4 ); \
|
||||
mm128_swap128_256( B5, C5 ); \
|
||||
mm128_swap128_256( B6, C6 ); \
|
||||
mm128_swap128_256( B7, C7 ); \
|
||||
mm128_swap128_256( B8, C8 ); \
|
||||
mm128_swap128_256( B9, C9 ); \
|
||||
mm128_swap128_256( BA, CA ); \
|
||||
mm128_swap128_256( BB, CB ); \
|
||||
mm128_swap128_256( BC, CC ); \
|
||||
mm128_swap128_256( BD, CD ); \
|
||||
mm128_swap128_256( BE, CE ); \
|
||||
mm128_swap128_256( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
|
@@ -40,7 +40,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
410
algo/shavite/shavite-hash-2way.c
Normal file
410
algo/shavite/shavite-hash-2way.c
Normal file
@@ -0,0 +1,410 @@
|
||||
#include "shavite-hash-2way.h"
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const uint32_t IV512[] =
|
||||
{
|
||||
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
|
||||
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
|
||||
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
|
||||
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
|
||||
};
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
|
||||
mm256_ror1x32_128( b ), 0x88 )
|
||||
|
||||
static void
|
||||
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
__m256i p0, p1, p2, p3, x;
|
||||
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m256i *m = (__m256i*)msg;
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
int r;
|
||||
|
||||
p0 = h[0];
|
||||
p1 = h[1];
|
||||
p2 = h[2];
|
||||
p3 = h[3];
|
||||
|
||||
// round
|
||||
k00 = m[0];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
|
||||
k01 = m[1];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = m[2];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = m[3];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p0 = _mm256_xor_si256( p0, x );
|
||||
|
||||
k10 = m[4];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
|
||||
k11 = m[5];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
k12 = m[6];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = m[7];
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
|
||||
p2 = _mm256_xor_si256( p2, x );
|
||||
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k00 ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
|
||||
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
|
||||
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
|
||||
k01 = _mm256_xor_si256( k00,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k01 ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = _mm256_xor_si256( k01,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k02 ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = _mm256_xor_si256( k02,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k03 ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( k03,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k10 ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
|
||||
k11 = _mm256_xor_si256( k10,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k11 ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
k12 = _mm256_xor_si256( k11,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = _mm256_xor_si256( k12,
|
||||
mm256_ror1x32_128( mm256_aesenc_2x128( k13 ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
p1 = _mm256_xor_si256( p1, x );
|
||||
|
||||
// round 2, 6, 10
|
||||
|
||||
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ) );
|
||||
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p2 = _mm256_xor_si256( p2, x );
|
||||
|
||||
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ) );
|
||||
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
|
||||
p0 = _mm256_xor_si256( p0, x );
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k00 ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ) );
|
||||
k01 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k01 ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k02 ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k03 ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p1 = _mm256_xor_si256( p1, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k10 ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ) );
|
||||
k11 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k11 ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
k12 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k12 ) ), k11 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k13 ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
|
||||
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p0 = _mm256_xor_si256( p0, x );
|
||||
|
||||
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
|
||||
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
|
||||
p2 = _mm256_xor_si256( p2, x );
|
||||
|
||||
}
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k00 ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
|
||||
k01 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k01 ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
|
||||
k02 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k02 ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
|
||||
k03 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k03 ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k10 ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
|
||||
k11 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k11 ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
|
||||
|
||||
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) );
|
||||
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
|
||||
k13 = _mm256_xor_si256( mm256_ror1x32_128(
|
||||
mm256_aesenc_2x128( k13 ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
|
||||
|
||||
p1 = _mm256_xor_si256( p1, x );
|
||||
|
||||
h[0] = _mm256_xor_si256( h[0], p2 );
|
||||
h[1] = _mm256_xor_si256( h[1], p3 );
|
||||
h[2] = _mm256_xor_si256( h[2], p0 );
|
||||
h[3] = _mm256_xor_si256( h[3], p1 );
|
||||
}
|
||||
|
||||
void shavite512_2way_init( shavite512_2way_context *ctx )
|
||||
{
|
||||
casti_m256i( ctx->h, 0 ) =
|
||||
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0],
|
||||
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
|
||||
casti_m256i( ctx->h, 1 ) =
|
||||
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4],
|
||||
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] );
|
||||
casti_m256i( ctx->h, 2 ) =
|
||||
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
|
||||
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
|
||||
casti_m256i( ctx->h, 3 ) =
|
||||
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
|
||||
IV512[15], IV512[14], IV512[13], IV512[12] );
|
||||
ctx->ptr = 0;
|
||||
ctx->count0 = 0;
|
||||
ctx->count1 = 0;
|
||||
ctx->count2 = 0;
|
||||
ctx->count3 = 0;
|
||||
}
|
||||
|
||||
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
unsigned char *buf = ctx->buf;
|
||||
size_t ptr = ctx->ptr;
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof ctx->buf) - ptr;
|
||||
if ( clen > len << 1 )
|
||||
clen = len << 1;
|
||||
memcpy( buf + ptr, data, clen );
|
||||
data = (const unsigned char *)data + clen;
|
||||
ptr += clen;
|
||||
len -= clen >> 1;
|
||||
if ( ptr == sizeof ctx->buf )
|
||||
{
|
||||
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
|
||||
{
|
||||
ctx->count1 = ctx->count1 + 1;
|
||||
if ( ctx->count1 == 0 )
|
||||
{
|
||||
ctx->count2 = ctx->count2 + 1;
|
||||
if ( ctx->count2 == 0 )
|
||||
ctx->count3 = ctx->count3 + 1;
|
||||
}
|
||||
}
|
||||
c512_2way( ctx, buf );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
ctx->ptr = ptr;
|
||||
}
|
||||
|
||||
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
{
|
||||
unsigned char *buf;
|
||||
union
|
||||
{
|
||||
uint32_t u32[4];
|
||||
uint16_t u16[8];
|
||||
} count;
|
||||
|
||||
buf = ctx->buf;
|
||||
uint32_t vp = ctx->ptr>>5;
|
||||
|
||||
// Terminating byte then zero pad
|
||||
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
|
||||
// Zero pad full vectors up to count
|
||||
for ( ; vp < 6; vp++ )
|
||||
casti_m256i( buf, vp ) = m256_zero;
|
||||
|
||||
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
|
||||
// Count is misaligned to 16 bits and straddles a vector.
|
||||
// Use u32 overlay to stage then u16 to load buf.
|
||||
count.u32[0] = ctx->count0 += (ctx->ptr << 2); // ptr/2 * 8
|
||||
count.u32[1] = ctx->count1;
|
||||
count.u32[2] = ctx->count2;
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
|
||||
count.u16[0], 0,0,0,0,0,0,0 );
|
||||
casti_m256i( buf, 7 ) = _mm256_set_epi16(
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
|
||||
|
||||
c512_2way( ctx, buf);
|
||||
|
||||
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
|
||||
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
|
||||
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
|
||||
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
|
||||
}
|
||||
|
||||
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
|
||||
const void *data, size_t len )
|
||||
{
|
||||
unsigned char *buf = ctx->buf;
|
||||
size_t ptr = ctx->ptr;
|
||||
|
||||
// process full blocks and load buf with remainder.
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof ctx->buf) - ptr;
|
||||
if ( clen > len << 1 )
|
||||
clen = len << 1;
|
||||
memcpy( buf + ptr, data, clen );
|
||||
data = (const unsigned char *)data + clen;
|
||||
ptr += clen;
|
||||
len -= (clen >> 1);
|
||||
if ( ptr == sizeof ctx->buf )
|
||||
{
|
||||
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
|
||||
{
|
||||
ctx->count1 = ctx->count1 + 1;
|
||||
if ( ctx->count1 == 0 )
|
||||
{
|
||||
ctx->count2 = ctx->count2 + 1;
|
||||
if ( ctx->count2 == 0 )
|
||||
ctx->count3 = ctx->count3 + 1;
|
||||
}
|
||||
}
|
||||
c512_2way( ctx, buf );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t vp = ptr>>5;
|
||||
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
|
||||
// Count is misaligned to 16 bits and straddles 2 vectors.
|
||||
// Use u32 overlay to stage then u16 to load buf.
|
||||
union
|
||||
{
|
||||
uint32_t u32[4];
|
||||
uint16_t u16[8];
|
||||
} count;
|
||||
|
||||
count.u32[0] = ctx->count0 += (ptr << 2); // ptr/2 * 8
|
||||
count.u32[1] = ctx->count1;
|
||||
count.u32[2] = ctx->count2;
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
if ( vp == 0 ) // empty buf, xevan.
|
||||
{
|
||||
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
memset_zero_256( (__m256i*)buf + 1, 5 );
|
||||
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
|
||||
}
|
||||
else // half full buf, everyone else.
|
||||
{
|
||||
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
|
||||
count.u16[0], 0,0,0,0,0,0,0 );
|
||||
casti_m256i( buf, 7 ) = _mm256_set_epi16(
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
|
||||
|
||||
c512_2way( ctx, buf);
|
||||
|
||||
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
|
||||
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
|
||||
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
|
||||
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
|
||||
}
|
||||
|
||||
#endif // AVX2
|
25
algo/shavite/shavite-hash-2way.h
Normal file
25
algo/shavite/shavite-hash-2way.h
Normal file
@@ -0,0 +1,25 @@
|
||||
#ifndef SHAVITE_HASH_2WAY_H__
|
||||
#define SHAVITE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[128<<1];
|
||||
uint32_t h[16<<1];
|
||||
size_t ptr;
|
||||
uint32_t count0, count1, count2, count3;
|
||||
} shavite512_2way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void shavite512_2way_init( shavite512_2way_context *ctx );
|
||||
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst );
|
||||
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
|
||||
const void *data, size_t len );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#endif // SHAVITE_HASH_2WAY_H__
|
||||
|
@@ -36,7 +36,7 @@
|
||||
#ifdef __AES__
|
||||
|
||||
#include "sph_shavite.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -102,35 +102,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k01 = m[1];
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k02 = m[2];
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k03 = m[3];
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
k10 = m[4];
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k11 = m[5];
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k12 = m[6];
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k13 = m[7];
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
@@ -156,15 +152,15 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
@@ -172,12 +168,10 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
|
||||
@@ -196,118 +190,103 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
|
||||
x = _mm_xor_si128( p3, k00 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
|
||||
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
|
||||
x = _mm_xor_si128( p1, k10 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p2, k00 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p0, k10 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
|
||||
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
|
||||
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
}
|
||||
|
||||
@@ -315,46 +294,41 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
|
||||
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
|
||||
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, m128_zero );
|
||||
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
h[0] = _mm_xor_si128( h[0], p2 );
|
||||
@@ -427,6 +401,9 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
|
||||
count1 = sc->count1;
|
||||
count2 = sc->count2;
|
||||
count3 = sc->count3;
|
||||
|
||||
|
||||
|
||||
z = 0x80 >> n;
|
||||
z = ((ub & -z) | z) & 0xFF;
|
||||
if (ptr == 0 && n == 0) {
|
||||
@@ -443,6 +420,7 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
|
||||
memset(buf, 0, 110);
|
||||
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
|
||||
}
|
||||
|
||||
sph_enc32le(buf + 110, count0);
|
||||
sph_enc32le(buf + 114, count1);
|
||||
sph_enc32le(buf + 118, count2);
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
typedef struct {
|
||||
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
|
||||
|
@@ -17,13 +17,13 @@ void skeinhash_4way( void *state, const void *input )
|
||||
skein512_4way( &ctx_skein, input, 80 );
|
||||
skein512_4way_close( &ctx_skein, vhash64 );
|
||||
|
||||
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
|
||||
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
|
||||
|
||||
sha256_4way_init( &ctx_sha256 );
|
||||
sha256_4way( &ctx_sha256, vhash32, 64 );
|
||||
sha256_4way_close( &ctx_sha256, state );
|
||||
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96,
|
||||
mm128_dintrlv_4x32( state, state+32, state+64, state+96,
|
||||
vhash32, 256 );
|
||||
}
|
||||
|
||||
@@ -48,7 +48,7 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
|
||||
|
@@ -3,9 +3,12 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
// Override multi way on ryzen, SHA is better.
|
||||
#if !defined(RYZEN_)
|
||||
#if defined(__AVX2__)
|
||||
#define SKEIN_4WAY
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SKEIN_4WAY)
|
||||
|
||||
|
@@ -49,7 +49,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
// Output size in bits
|
||||
#define SPH_SIZE_skein256 256
|
||||
|
@@ -3,31 +3,20 @@
|
||||
#include <stdint.h>
|
||||
#include "sph_skein.h"
|
||||
#include <openssl/sha.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
|
||||
void skeinhash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
sph_skein512_context ctx_skein;
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_CTX ctx_sha256;
|
||||
#else
|
||||
sph_sha256_context ctx_sha256;
|
||||
#endif
|
||||
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, input, 80 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
|
||||
SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
|
||||
#else
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash );
|
||||
#endif
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
@@ -59,7 +59,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@@ -52,7 +52,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL.
|
||||
|
@@ -114,11 +114,11 @@ void x12_4way_hash( void *state, const void *input )
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
|
@@ -17,8 +17,6 @@
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
//#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
@@ -27,45 +25,42 @@
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
// sph_fugue512_context fugue;
|
||||
} x12_ctx_holder;
|
||||
|
||||
x12_ctx_holder x12_ctx;
|
||||
|
||||
void init_x12_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&x12_ctx.groestl);
|
||||
sph_echo512_init(&x12_ctx.echo);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
init_echo( &x12_ctx.echo, 512 );
|
||||
init_groestl (&x12_ctx.groestl, 64 );
|
||||
#else
|
||||
sph_groestl512_init(&x12_ctx.groestl);
|
||||
sph_echo512_init(&x12_ctx.echo);
|
||||
#endif
|
||||
init_luffa( &x12_ctx.luffa, 512 );
|
||||
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
|
||||
sph_shavite512_init( &x12_ctx.shavite );
|
||||
init_sd( &x12_ctx.simd, 512 );
|
||||
sph_hamsi512_init( &x12_ctx.hamsi );
|
||||
// sph_fugue512_init( &x13_ctx.fugue );
|
||||
};
|
||||
|
||||
void x12hash(void *output, const void *input)
|
||||
@@ -108,12 +103,12 @@ void x12hash(void *output, const void *input)
|
||||
|
||||
//---groetl----
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
|
||||
//---skein4---
|
||||
@@ -153,23 +148,18 @@ void x12hash(void *output, const void *input)
|
||||
|
||||
//11---echo---
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512(&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hashB);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#else
|
||||
sph_echo512(&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hashB);
|
||||
#endif
|
||||
|
||||
// 12 Hamsi
|
||||
sph_hamsi512(&ctx.hamsi, hashB, 64);
|
||||
sph_hamsi512_close(&ctx.hamsi, hash);
|
||||
|
||||
/*
|
||||
// 13 Fugue
|
||||
sph_fugue512(&ctx.fugue, hash, 64);
|
||||
sph_fugue512_close(&ctx.fugue, hashB);
|
||||
*/
|
||||
asm volatile ("emms");
|
||||
memcpy(output, hashB, 32);
|
||||
}
|
||||
|
@@ -49,10 +49,10 @@ void polytimos_4way_hash( void *output, const void *input )
|
||||
|
||||
// Need to convert from 64 bit interleaved to 32 bit interleaved.
|
||||
uint32_t vhash32[16*4];
|
||||
mm256_reinterleave_4x32( vhash32, vhash, 512 );
|
||||
mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash32, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash32 );
|
||||
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
@@ -66,13 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
|
||||
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
|
||||
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
|
@@ -183,16 +183,16 @@ void x16r_4way_hash( void* output, const void* input )
|
||||
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
|
||||
(const byte*)in0, size );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1, size );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2, size );
|
||||
cubehashReinit( &ctx.cube );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3, size );
|
||||
break;
|
||||
@@ -293,7 +293,7 @@ void x16r_4way_hash( void* output, const void* input )
|
||||
}
|
||||
|
||||
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
@@ -303,6 +303,7 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
@@ -35,7 +35,7 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
|
||||
bool register_x16r_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_4WAY)
|
||||
init_x16r_4way_ctx();
|
||||
// init_x16r_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
||||
@@ -52,7 +52,7 @@ bool register_x16r_algo( algo_gate_t* gate )
|
||||
bool register_x16s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_4WAY)
|
||||
init_x16r_4way_ctx();
|
||||
// init_x16r_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
@@ -2,7 +2,7 @@
|
||||
#define X16R_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
@@ -41,7 +41,7 @@ bool register_x16s_algo( algo_gate_t* gate );
|
||||
void x16r_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x16r_4way_ctx();
|
||||
|
||||
@@ -50,7 +50,7 @@ void init_x16r_4way_ctx();
|
||||
void x16r_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x16r_ctx();
|
||||
|
@@ -25,7 +25,7 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include <openssl/sha.h>
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
@@ -34,12 +34,12 @@ static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
@@ -95,14 +95,14 @@ void x16r_hash( void* output, const void* input )
|
||||
sph_bmw512_close(&ctx.bmw, hash);
|
||||
break;
|
||||
case GROESTL:
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in, size );
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)in, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in, size );
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
break;
|
||||
case SKEIN:
|
||||
@@ -141,14 +141,14 @@ void x16r_hash( void* output, const void* input )
|
||||
(const BitSequence*)in, size<<3 );
|
||||
break;
|
||||
case ECHO:
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
@@ -184,7 +184,7 @@ void x16r_hash( void* output, const void* input )
|
||||
}
|
||||
|
||||
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash32[8];
|
||||
uint32_t _ALIGN(128) endiandata[20];
|
||||
@@ -192,6 +192,7 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t nonce = first_nonce;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
@@ -16,10 +16,9 @@
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#endif
|
||||
@@ -42,18 +41,14 @@ typedef struct {
|
||||
sph_fugue512_context fugue1, fugue2;
|
||||
sph_shabal512_context shabal1;
|
||||
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA512_CTX sha1, sha2;
|
||||
#else
|
||||
sph_sha512_context sha1, sha2;
|
||||
#endif
|
||||
sph_haval256_5_context haval1, haval2;
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl1, groestl2;
|
||||
sph_echo512_context echo1, echo2;
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo1, echo2;
|
||||
hashState_groestl groestl1, groestl2;
|
||||
#else
|
||||
sph_groestl512_context groestl1, groestl2;
|
||||
sph_echo512_context echo1, echo2;
|
||||
#endif
|
||||
} hmq1725_ctx_holder;
|
||||
|
||||
@@ -101,26 +96,22 @@ void init_hmq1725_ctx()
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA512_Init( &hmq1725_ctx.sha1 );
|
||||
SHA512_Init( &hmq1725_ctx.sha2 );
|
||||
#else
|
||||
sph_sha512_init(&hmq1725_ctx.sha1);
|
||||
sph_sha512_init(&hmq1725_ctx.sha2);
|
||||
#endif
|
||||
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval1);
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval2);
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl1 );
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl2 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo1 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo2 );
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
init_echo( &hmq1725_ctx.echo1, 512 );
|
||||
init_echo( &hmq1725_ctx.echo2, 512 );
|
||||
init_groestl( &hmq1725_ctx.groestl1, 64 );
|
||||
init_groestl( &hmq1725_ctx.groestl2, 64 );
|
||||
#else
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl1 );
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl2 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo1 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo2 );
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -151,12 +142,12 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
|
||||
if ( hashB[0] & mask ) //1
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
|
||||
(const char*)hashB, 512 );
|
||||
#else
|
||||
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
|
||||
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
|
||||
#else
|
||||
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
|
||||
(const char*)hashB, 512 );
|
||||
#endif
|
||||
}
|
||||
else
|
||||
@@ -217,12 +208,12 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
memset(&hashB[8], 0, 32);
|
||||
}
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
|
||||
sph_echo512_close(&h_ctx.echo1, hashA); //6
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
|
||||
(const BitSequence *)hashB, 512 );
|
||||
#else
|
||||
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
|
||||
sph_echo512_close(&h_ctx.echo1, hashA); //6
|
||||
#endif
|
||||
|
||||
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
|
||||
@@ -247,12 +238,12 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512 (&h_ctx.echo2, hashA, 64); //
|
||||
sph_echo512_close(&h_ctx.echo2, hashB); //5
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
|
||||
(const BitSequence *)hashA, 512 );
|
||||
#else
|
||||
sph_echo512 (&h_ctx.echo2, hashA, 64); //
|
||||
sph_echo512_close(&h_ctx.echo2, hashB); //5
|
||||
#endif
|
||||
}
|
||||
else
|
||||
@@ -274,30 +265,20 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA512_Update( &h_ctx.sha1, hashB, 64 );
|
||||
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
|
||||
#else
|
||||
sph_sha512 (&h_ctx.sha1, hashB, 64); //7
|
||||
sph_sha512_close(&h_ctx.sha1, hashA); //8
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
|
||||
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
|
||||
(const char*)hashA, 512 );
|
||||
#else
|
||||
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
|
||||
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
|
||||
#endif
|
||||
|
||||
#ifndef USE_SPH_SHA
|
||||
SHA512_Update( &h_ctx.sha2, hashB, 64 );
|
||||
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
|
||||
#else
|
||||
sph_sha512 (&h_ctx.sha2, hashB, 64); //2
|
||||
sph_sha512_close(&h_ctx.sha2, hashA); //3
|
||||
#endif
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
@@ -318,7 +299,7 @@ extern void hmq1725hash(void *state, const void *input)
|
||||
}
|
||||
|
||||
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t endiandata[32] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(64)));
|
||||
@@ -326,6 +307,7 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
//const uint32_t Htarg = ptarget[7];
|
||||
|
||||
//we need bigendian data...
|
||||
|
856
algo/x17/sonoa-4way.c
Normal file
856
algo/x17/sonoa-4way.c
Normal file
@@ -0,0 +1,856 @@
|
||||
#include "sonoa-gate.h"
|
||||
|
||||
#if defined(SONOA_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/sha/sha2-hash-4way.h"
|
||||
|
||||
union _sonoa_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
haval256_5_4way_context haval;
|
||||
};
|
||||
|
||||
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
|
||||
|
||||
void sonoa_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||
sonoa_4way_context_overlay ctx;
|
||||
|
||||
// 1
|
||||
|
||||
blake512_4way_init( &ctx.blake );
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
// 2
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
// 3
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 4
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm256_rintrlv_4x32_4x64( vhashB, vhash, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_2x128( vhashA, hash0, hash1, 512 );
|
||||
mm256_intrlv_2x128( vhashB, hash2, hash3, 512 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
// 5
|
||||
mm256_rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhashB, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
// 6
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
sha512_4way_init( &ctx.sha512 );
|
||||
sha512_4way( &ctx.sha512, vhash, 64 );
|
||||
sha512_4way_close( &ctx.sha512, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
// 7
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_2way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
sha512_4way_init( &ctx.sha512 );
|
||||
sha512_4way( &ctx.sha512, vhash, 64 );
|
||||
sha512_4way_close( &ctx.sha512, vhash );
|
||||
|
||||
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
|
||||
|
||||
haval256_5_4way_init( &ctx.haval );
|
||||
haval256_5_4way( &ctx.haval, vhashB, 64 );
|
||||
haval256_5_4way_close( &ctx.haval, state );
|
||||
}
|
||||
|
||||
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// Need big endian data
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
|
||||
*noncev );
|
||||
sonoa_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x17/sonoa-gate.c
Normal file
18
algo/x17/sonoa-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "sonoa-gate.h"
|
||||
|
||||
bool register_sonoa_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (SONOA_4WAY)
|
||||
// init_sonoa_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_sonoa_4way;
|
||||
gate->hash = (void*)&sonoa_4way_hash;
|
||||
#else
|
||||
init_sonoa_ctx();
|
||||
gate->scanhash = (void*)&scanhash_sonoa;
|
||||
gate->hash = (void*)&sonoa_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x1ffff;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
return true;
|
||||
};
|
||||
|
32
algo/x17/sonoa-gate.h
Normal file
32
algo/x17/sonoa-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef SONOA_GATE_H__
|
||||
#define SONOA_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define SONOA_4WAY
|
||||
#endif
|
||||
|
||||
bool register_sonoa_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SONOA_4WAY)
|
||||
|
||||
void sonoa_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//void init_sonoa_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void sonoa_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_sonoa( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_sonoa_ctx();
|
||||
|
||||
#endif
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user