mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
32 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
3572cb53c4 | ||
![]() |
241bc26767 | ||
![]() |
c65b0ff7a6 | ||
![]() |
a17ff6f189 | ||
![]() |
73430b13b1 | ||
![]() |
40039386a0 | ||
![]() |
91ec6f1771 | ||
![]() |
a52c5eccf7 | ||
![]() |
86b889e1b0 | ||
![]() |
72330eb5a7 | ||
![]() |
789c8b70bc | ||
![]() |
01550d94a2 | ||
![]() |
a042fb7612 | ||
![]() |
9d49e0be7a | ||
![]() |
a51f59086b | ||
![]() |
6f49ba09b7 | ||
![]() |
e2d5762ef2 | ||
![]() |
e625ed5420 | ||
![]() |
9abc19a30a | ||
![]() |
0d769ee0fe | ||
![]() |
0d48d573ce | ||
![]() |
d6e8d7a46e | ||
![]() |
71d6b97ee8 | ||
![]() |
b2331375a3 | ||
![]() |
7fec680835 | ||
![]() |
1b0a5aadf6 | ||
![]() |
0a3c52810e | ||
![]() |
4d4386a374 | ||
![]() |
ce259b915a | ||
![]() |
02202ab803 | ||
![]() |
77c5ae80ab | ||
![]() |
eb3f57bfc7 |
150
INSTALL_LINUX
Normal file
150
INSTALL_LINUX
Normal file
@@ -0,0 +1,150 @@
|
||||
|
||||
|
||||
1. Requirements:
|
||||
---------------
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux operating system. Apple is not supported.
|
||||
|
||||
2. Building on linux prerequisites:
|
||||
-----------------------------------
|
||||
|
||||
It is assumed users know how to install packages on their system and
|
||||
be able to compile standard source packages. This is basic Linux and
|
||||
beyond the scope of cpuminer-opt. Regardless compiling is trivial if you
|
||||
follow the instructions.
|
||||
|
||||
Make sure you have the basic development packages installed.
|
||||
Here is a good start:
|
||||
|
||||
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
||||
|
||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||
are some of the ones that may not be in the default install and need to
|
||||
be installed manually. There may be others, read the compiler error messages,
|
||||
they will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu. Fedora and other distributions may have similar
|
||||
but different package names.
|
||||
|
||||
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||
support depending on your CPU and compiler version:
|
||||
|
||||
"-march=native" is always the best choice
|
||||
|
||||
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||
|
||||
"-msha" Add SHA to other tuning options
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
Static builds should only considered in a homogeneous HW and SW environment.
|
||||
Local builds will always have the best performance and compatibility.
|
||||
|
||||
3. Download cpuminer-opt
|
||||
------------------------
|
||||
|
||||
Download the source code for the latest realease from the official repository.
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/releases
|
||||
|
||||
Extract the source code.
|
||||
|
||||
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
|
||||
|
||||
Alternatively it can be cloned from git.
|
||||
|
||||
$ git clone https://github.com/JayDDee/cpuminer-opt.git
|
||||
|
||||
4. Build cpuminer-opt
|
||||
---------------------
|
||||
|
||||
It is recomended to Build with default options, this will usuallly
|
||||
produce the best results.
|
||||
|
||||
$ ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
or
|
||||
|
||||
$ ./autogen.sh
|
||||
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
$ make -j n
|
||||
|
||||
n is the number of threads.
|
||||
|
||||
5. Start mining.
|
||||
----------------
|
||||
|
||||
$ ./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
|
||||
Windows
|
||||
-------
|
||||
|
||||
See also INSTAL_WINDOWS
|
||||
|
||||
The following procedure is obsolete and uses an old compiler.
|
||||
|
||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||
with a more recent compiler than the following Windows hosted procedure.
|
||||
|
||||
Building on Windows prerequisites:
|
||||
|
||||
msys
|
||||
mingw_w64
|
||||
Visual C++ redistributable 2008 X64
|
||||
openssl
|
||||
|
||||
Install msys and mingw_w64, only needed once.
|
||||
|
||||
Unpack msys into C:\msys or your preferred directory.
|
||||
|
||||
Install mingw_w64 from win-builds.
|
||||
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
|
||||
existing msys instalation.
|
||||
|
||||
Open a msys shell by double clicking on msys.bat.
|
||||
Note that msys shell uses linux syntax for file specifications, "C:\" is
|
||||
mounted at "/c/".
|
||||
|
||||
Add mingw bin directory to PATH variable
|
||||
PATH="/c/msys/opt/windows_64/bin/:$PATH"
|
||||
|
||||
Instalation complete, compile cpuminer-opt.
|
||||
|
||||
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
|
||||
or similar Windows program.
|
||||
|
||||
In msys shell cd to miner directory.
|
||||
cd /c/path/to/cpuminer-opt
|
||||
|
||||
Run build.sh to build on Windows or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Start mining
|
||||
|
||||
cpuminer.exe -a algo -o url -u user -p password
|
||||
|
||||
The following tips may be useful for older AMD CPUs.
|
||||
|
||||
AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
|
||||
not supported by cpuminer-opt due to an incompatible implementation of SSE2
|
||||
on these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Some users with AMD CPUs without AES_NI have reported problems compiling
|
||||
with build.sh or "-march=native". Problems have included compile errors
|
||||
and poor performance. These users are recommended to compile manually
|
||||
specifying "-march=btver1" on the configure command line.
|
||||
|
||||
Support for even older x86_64 without AES_NI or SSE2 is not availble.
|
||||
|
172
INSTALL_WINDOWS
Normal file
172
INSTALL_WINDOWS
Normal file
@@ -0,0 +1,172 @@
|
||||
Instructions for compiling cpuminer-opt for Windows.
|
||||
|
||||
|
||||
Windows compilation using Visual Studio is not supported. Mingw64 is
|
||||
used on a Linux system (bare metal or virtual machine) to cross-compile
|
||||
cpuminer-opt executable binaries for Windows.
|
||||
|
||||
These instructions were written for Debian and Ubuntu compatible distributions
|
||||
but should work on other major distributions as well. However some of the
|
||||
package names or file paths may be different.
|
||||
|
||||
It is assumed a Linux system is already available and running. And the user
|
||||
has enough Linux knowledge to find and install packages and follow these
|
||||
instructions.
|
||||
|
||||
First it is a good idea to create new user specifically for cross compiling.
|
||||
It keeps all mingw stuff contained and isolated from the rest of the system.
|
||||
|
||||
Step by step...
|
||||
|
||||
1. Install necessary packages from the distribution's repositories.
|
||||
|
||||
Refer to Linux compile instructions and install required packages.
|
||||
|
||||
Additionally, install mingw-w64.
|
||||
|
||||
sudo apt-get install mingw-w64
|
||||
|
||||
|
||||
2. Create a local library directory for packages to be compiled in the next
|
||||
step. Suggested location is $HOME/usr/lib/
|
||||
|
||||
3. Download and build other packages for mingw that don't have a mingw64
|
||||
version available in the repositories.
|
||||
|
||||
Download the following source code packages from their respective and
|
||||
respected download locations, copy them to ~/usr/lib/ and uncompress them.
|
||||
|
||||
openssl
|
||||
curl
|
||||
gmp
|
||||
|
||||
In most cases the latest vesrion is ok but it's safest to download
|
||||
the same major and minor version as included in your distribution.
|
||||
|
||||
Run the following commands or follow the supplied instructions.
|
||||
Do not run "make install" unless you are using ~/usr/lib, which isn't
|
||||
recommended.
|
||||
|
||||
Some instructions insist on running "make check". If make check fails
|
||||
it may still work, YMMV.
|
||||
|
||||
You can speed up "make" by using all CPU cores available with "-j n" where
|
||||
n is the number of CPU threads you want to use.
|
||||
|
||||
openssl:
|
||||
|
||||
./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
curl:
|
||||
|
||||
./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
gmp:
|
||||
|
||||
./configure --host=x86_64-w64-mingw32
|
||||
make
|
||||
|
||||
|
||||
|
||||
4. Tweak the environment.
|
||||
|
||||
This step is required everytime you login or the commands can be added to
|
||||
.bashrc.
|
||||
|
||||
Define some local variables to point to local library.
|
||||
|
||||
export LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
Create a release directory and copy some dll files previously built.
|
||||
This can be done outside of cpuminer-opt and only needs to be done once.
|
||||
If the release directory is in cpuminer-opt directory it needs to be
|
||||
recreated every a source package is decompressed.
|
||||
|
||||
mkdir release
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
|
||||
|
||||
The following steps need to be done every time a new source package is
|
||||
opened.
|
||||
|
||||
5. Download cpuminer-opt
|
||||
|
||||
Download the latest source code package of cpumuner-opt to your desired
|
||||
location. .zip or .tar.gz, your choice.
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/releases
|
||||
|
||||
Decompress and change to the cpuminer-opt directory.
|
||||
|
||||
|
||||
|
||||
6. Prepare to compile
|
||||
|
||||
Create a link to the locally compiled version of gmp.h
|
||||
|
||||
ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
|
||||
|
||||
Edit configure.ac to fix lipthread package name.
|
||||
|
||||
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
|
||||
7. Compile
|
||||
|
||||
you can use the default compile if you intend to use cpuminer-opt on the
|
||||
same CPU and the virtual machine supports that architecture.
|
||||
|
||||
./build.sh
|
||||
|
||||
Otherwise you can compile manually while setting options in CFLAGS.
|
||||
|
||||
Some common options:
|
||||
|
||||
To compile for a specific CPU architecture:
|
||||
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for AMD Ryzen.
|
||||
|
||||
You can compile more generically for a set of specific CPU features
|
||||
if you know what features you want:
|
||||
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for an older CPU that does not have AVX.
|
||||
|
||||
You can find several examples in build-allarch.sh
|
||||
|
||||
If you have a CPU with more than 64 threads and Windows 7 or higher you
|
||||
can enable the CPU Groups feature:
|
||||
|
||||
-D_WIN32_WINNT==0x0601
|
||||
|
||||
Once you have run configure successfully run make with n CPU threads:
|
||||
|
||||
make -j n
|
||||
|
||||
Copy cpuminer.exe to the release directory, compress and copy the release
|
||||
directory to a Windows system and run cpuminer.exe from the command line.
|
||||
|
||||
Run cpuminer
|
||||
|
||||
In a command windows change directories to the unzipped release folder.
|
||||
to get a list of all options:
|
||||
|
||||
cpuminer.exe --help
|
||||
|
||||
Command options are specific to where you mine. Refer to the pool's
|
||||
instructions on how to set them.
|
77
Makefile.am
77
Makefile.am
@@ -18,7 +18,6 @@ dist_man_MANS = cpuminer.1
|
||||
cpuminer_SOURCES = \
|
||||
cpu-miner.c \
|
||||
util.c \
|
||||
uint256.cpp \
|
||||
api.c \
|
||||
sysinfos.c \
|
||||
algo-gate-api.c\
|
||||
@@ -42,20 +41,24 @@ cpuminer_SOURCES = \
|
||||
algo/argon2/argon2d/argon2d/argon2.c \
|
||||
algo/argon2/argon2d/argon2d/core.c \
|
||||
algo/argon2/argon2d/argon2d/opt.c \
|
||||
algo/argon2/argon2d/argon2d/thread.c \
|
||||
algo/argon2/argon2d/argon2d/argon2d_thread.c \
|
||||
algo/argon2/argon2d/argon2d/encoding.c \
|
||||
algo/blake/sph_blake.c \
|
||||
algo/blake/blake-hash-4way.c \
|
||||
algo/blake/blake256-hash-4way.c \
|
||||
algo/blake/blake512-hash-4way.c \
|
||||
algo/blake/blake-gate.c \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
algo/blake/sph_blake2b.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/sph-blake2s.c \
|
||||
algo/blake/blake2s-hash-4way.c \
|
||||
algo/blake/blake2s.c \
|
||||
algo/blake/blake2s-gate.c \
|
||||
algo/blake/blake2s-4way.c \
|
||||
algo/blake/blake2b-hash-4way.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/blake2b-gate.c \
|
||||
algo/blake/blake2b-4way.c \
|
||||
algo/blake/blakecoin-gate.c \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
@@ -67,8 +70,12 @@ cpuminer_SOURCES = \
|
||||
algo/blake/pentablake-4way.c \
|
||||
algo/blake/pentablake.c \
|
||||
algo/bmw/sph_bmw.c \
|
||||
algo/bmw/bmw-hash-4way.c \
|
||||
algo/bmw/bmw256-hash-4way.c \
|
||||
algo/bmw/bmw512-hash-4way.c \
|
||||
algo/bmw/bmw256.c \
|
||||
algo/bmw/bmw512-gate.c \
|
||||
algo/bmw/bmw512.c \
|
||||
algo/bmw/bmw512-4way.c \
|
||||
algo/cryptonight/cryptolight.c \
|
||||
algo/cryptonight/cryptonight-common.c\
|
||||
algo/cryptonight/cryptonight-aesni.c\
|
||||
@@ -77,10 +84,14 @@ cpuminer_SOURCES = \
|
||||
algo/cubehash/cubehash_sse2.c\
|
||||
algo/cubehash/cube-hash-2way.c \
|
||||
algo/echo/sph_echo.c \
|
||||
algo/echo/echo-hash-4way.c \
|
||||
algo/echo/aes_ni/hash.c\
|
||||
algo/gost/sph_gost.c \
|
||||
algo/groestl/groestl-gate.c \
|
||||
algo/groestl/groestl512-hash-4way.c \
|
||||
algo/groestl/sph_groestl.c \
|
||||
algo/groestl/groestl.c \
|
||||
algo/groestl/groestl-4way.c \
|
||||
algo/groestl/myrgr-gate.c \
|
||||
algo/groestl/myrgr-4way.c \
|
||||
algo/groestl/myr-groestl.c \
|
||||
@@ -110,12 +121,15 @@ cpuminer_SOURCES = \
|
||||
algo/keccak/keccak-4way.c\
|
||||
algo/keccak/keccak-gate.c \
|
||||
algo/keccak/sse2/keccak.c \
|
||||
algo/lanehash/lane.c \
|
||||
algo/luffa/sph_luffa.c \
|
||||
algo/luffa/luffa.c \
|
||||
algo/luffa/luffa_for_sse2.c \
|
||||
algo/luffa/luffa-hash-2way.c \
|
||||
algo/lyra2/lyra2.c \
|
||||
algo/lyra2/sponge.c \
|
||||
algo/lyra2/sponge-2way.c \
|
||||
algo/lyra2/lyra2-hash-2way.c \
|
||||
algo/lyra2/lyra2-gate.c \
|
||||
algo/lyra2/lyra2rev2.c \
|
||||
algo/lyra2/lyra2rev2-4way.c \
|
||||
@@ -129,20 +143,24 @@ cpuminer_SOURCES = \
|
||||
algo/lyra2/lyra2h-4way.c \
|
||||
algo/lyra2/allium-4way.c \
|
||||
algo/lyra2/allium.c \
|
||||
algo/lyra2/phi2-4way.c \
|
||||
algo/lyra2/phi2.c \
|
||||
algo/m7m.c \
|
||||
algo/neoscrypt/neoscrypt.c \
|
||||
algo/nist5/nist5-gate.c \
|
||||
algo/nist5/nist5-4way.c \
|
||||
algo/nist5/nist5.c \
|
||||
algo/nist5/zr5.c \
|
||||
algo/pluck.c \
|
||||
algo/panama/sph_panama.c \
|
||||
algo/radiogatun/sph_radiogatun.c \
|
||||
algo/quark/quark-gate.c \
|
||||
algo/quark/quark.c \
|
||||
algo/quark/quark-4way.c \
|
||||
algo/quark/anime-gate.c \
|
||||
algo/quark/anime.c \
|
||||
algo/quark/anime-4way.c \
|
||||
algo/quark/hmq1725-gate.c \
|
||||
algo/quark/hmq1725-4way.c \
|
||||
algo/quark/hmq1725.c \
|
||||
algo/qubit/qubit-gate.c \
|
||||
algo/qubit/qubit.c \
|
||||
algo/qubit/qubit-2way.c \
|
||||
@@ -154,19 +172,26 @@ cpuminer_SOURCES = \
|
||||
algo/ripemd/lbry-gate.c \
|
||||
algo/ripemd/lbry.c \
|
||||
algo/ripemd/lbry-4way.c \
|
||||
algo/scrypt.c \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/scrypt/pluck.c \
|
||||
algo/scryptjane/scrypt-jane.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha2-hash-4way.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/shabal/sph_shabal.c \
|
||||
algo/shabal/shabal-hash-4way.c \
|
||||
algo/shavite/sph_shavite.c \
|
||||
algo/shavite/sph-shavite-aesni.c \
|
||||
algo/shavite/shavite-hash-2way.c \
|
||||
algo/shavite/shavite-hash-4way.c \
|
||||
algo/shavite/shavite.c \
|
||||
algo/simd/sph_simd.c \
|
||||
algo/simd/nist.c \
|
||||
@@ -179,14 +204,13 @@ cpuminer_SOURCES = \
|
||||
algo/skein/skein-gate.c \
|
||||
algo/skein/skein2.c \
|
||||
algo/skein/skein2-4way.c \
|
||||
algo/skein/skein2-gate.c \
|
||||
algo/sm3/sm3.c \
|
||||
algo/sm3/sm3-hash-4way.c \
|
||||
algo/swifftx/swifftx.c \
|
||||
algo/tiger/sph_tiger.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
algo/whirlpool/whirlpool-gate.c \
|
||||
algo/whirlpool/whirlpool-4way.c \
|
||||
algo/whirlpool/whirlpool.c \
|
||||
algo/whirlpool/whirlpoolx.c \
|
||||
algo/x11/x11-gate.c \
|
||||
@@ -227,6 +251,8 @@ cpuminer_SOURCES = \
|
||||
algo/x13/skunk-4way.c \
|
||||
algo/x13/skunk.c \
|
||||
algo/x13/drop.c \
|
||||
algo/x13/x13bcd-4way.c \
|
||||
algo/x13/x13bcd.c \
|
||||
algo/x14/x14-gate.c \
|
||||
algo/x14/x14.c \
|
||||
algo/x14/x14-4way.c \
|
||||
@@ -240,21 +266,38 @@ cpuminer_SOURCES = \
|
||||
algo/x15/x15-gate.c \
|
||||
algo/x15/x15.c \
|
||||
algo/x15/x15-4way.c \
|
||||
algo/x16/x16r-gate.c \
|
||||
algo/x16/x16r.c \
|
||||
algo/x16/x16r-4way.c \
|
||||
algo/x16/x16rv2.c \
|
||||
algo/x16/x16rv2-4way.c \
|
||||
algo/x16/x16rt.c \
|
||||
algo/x16/x16rt-4way.c \
|
||||
algo/x16/hex.c \
|
||||
algo/x16/x21s-4way.c \
|
||||
algo/x16/x21s.c \
|
||||
algo/x17/x17-gate.c \
|
||||
algo/x17/x17.c \
|
||||
algo/x17/x17-4way.c \
|
||||
algo/x17/xevan-gate.c \
|
||||
algo/x17/xevan.c \
|
||||
algo/x17/xevan-4way.c \
|
||||
algo/x17/x16r-gate.c \
|
||||
algo/x17/x16r.c \
|
||||
algo/x17/x16r-4way.c \
|
||||
algo/x17/hmq1725.c \
|
||||
algo/x17/sonoa-gate.c \
|
||||
algo/x17/sonoa-4way.c \
|
||||
algo/x17/sonoa.c \
|
||||
algo/x20/x20r.c \
|
||||
algo/x22/x22i-4way.c \
|
||||
algo/x22/x22i.c \
|
||||
algo/x22/x22i-gate.c \
|
||||
algo/x22/x25x.c \
|
||||
algo/x22/x25x-4way.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/sha256_Y.c \
|
||||
algo/yescrypt/yescrypt-best.c \
|
||||
algo/yespower/yespower.c \
|
||||
algo/yespower/sha256.c \
|
||||
algo/yespower/yespower-gate.c \
|
||||
algo/yespower/yespower-blake2b.c \
|
||||
algo/yespower/crypto/blake2b-yp.c \
|
||||
algo/yespower/sha256_p.c \
|
||||
algo/yespower/yespower-opt.c
|
||||
|
||||
disable_flags =
|
||||
|
58
README.md
58
README.md
@@ -16,14 +16,15 @@ https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
mailto://jayddee246@gmail.com
|
||||
|
||||
See file RELEASE_NOTES for change log and compile instructions.
|
||||
See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
|
||||
for compile instructions.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
|
||||
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
|
||||
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
|
||||
optimizations a CPU with AES_NI is required. This includes Intel Westmere
|
||||
and newer and AMD equivalents. Further optimizations are available on some
|
||||
algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
|
||||
|
||||
@@ -54,13 +55,12 @@ Supported Algorithms
|
||||
axiom Shabal-256 MemoHash
|
||||
bastion
|
||||
blake Blake-256 (SFR)
|
||||
blakecoin blake256r8
|
||||
blake2b Blake2b 256
|
||||
blake2s Blake-2 S
|
||||
blakecoin blake256r8
|
||||
bmw BMW 256
|
||||
bmw512 BMW 512
|
||||
c11 Chaincoin
|
||||
cryptolight Cryptonight-light
|
||||
cryptonight
|
||||
cryptonightv7 Monero (XMR)
|
||||
decred
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
@@ -68,6 +68,7 @@ Supported Algorithms
|
||||
fresh Fresh
|
||||
groestl Groestl coin
|
||||
heavy Heavy
|
||||
hex x16r-hex
|
||||
hmq1725 Espers
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
@@ -77,29 +78,34 @@ Supported Algorithms
|
||||
luffa Luffa
|
||||
lyra2h Hppcoin
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2, Vertcoin
|
||||
lyra2z Zcoin (XZC)
|
||||
lyra2rev2 lyra2v2
|
||||
lyra2rev3 lyrav2v3, Vertcoin
|
||||
lyra2z
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi, LUX coin (original algo)
|
||||
phi2 LUX coin (new algo)
|
||||
phi1612 phi
|
||||
phi2 Luxcoin (LUX)
|
||||
phi2-lux identical to phi2
|
||||
pluck Pluck:128 (Supcoin)
|
||||
polytimos Ninja
|
||||
power2b MicroBitcoin (MBC)
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
scryptjane:nf
|
||||
sha256d Double SHA-256
|
||||
sha256q Quad SHA-256, Pyrite (PYE)
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
sonoa Sono
|
||||
timetravel Machinecoin (MAC)
|
||||
timetravel10 Bitcore
|
||||
tribus Denarius (DNR)
|
||||
@@ -112,12 +118,19 @@ Supported Algorithms
|
||||
x11gost sib (SibCoin)
|
||||
x12 Galaxie Cash (GCH)
|
||||
x13 X13
|
||||
x13bcd bcd
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r Ravencoin (RVN)
|
||||
x16s pigeoncoin (PGN)
|
||||
x16r
|
||||
x16rv2 Ravencoin (RVN)
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
x17
|
||||
x21s
|
||||
x22i
|
||||
x25x
|
||||
xevan Bitsend (BSD)
|
||||
yescrypt Globalboost-Y (BSTY)
|
||||
yescryptr8 BitZeny (ZNY)
|
||||
@@ -125,11 +138,17 @@ Supported Algorithms
|
||||
yescryptr32 WAVI
|
||||
yespower Cryply
|
||||
yespowerr16 Yenten (YTN)
|
||||
yespower-b2b generic yespower + blake2b
|
||||
zr5 Ziftr
|
||||
|
||||
Errata
|
||||
------
|
||||
|
||||
Old algorithms that are no longer used frequently will not have the latest
|
||||
optimizations.
|
||||
|
||||
Cryptonight and variants are no longer supported, use another miner.
|
||||
|
||||
Neoscrypt crashes on Windows, use legacy version.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
@@ -145,14 +164,17 @@ Benchmark testing does not work for x11evo.
|
||||
Bugs
|
||||
----
|
||||
|
||||
Users are encouraged to post their bug reports on the Bitcoin Talk
|
||||
forum at:
|
||||
Users are encouraged to post their bug reports using git issues or on the
|
||||
Bitcoin Talk forum or opening an issue in git:
|
||||
|
||||
https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
All problem reports must be accompanied by a proper definition.
|
||||
https://github.com/JayDDee/cpuminer-opt/issues
|
||||
|
||||
All problem reports must be accompanied by a proper problem definition.
|
||||
This should include how the problem occurred, the command line and
|
||||
output from the miner showing the startup and any errors.
|
||||
output from the miner showing the startup messages and any errors.
|
||||
A history is also useful, ie did it work before.
|
||||
|
||||
Donations
|
||||
---------
|
||||
@@ -160,10 +182,6 @@ Donations
|
||||
cpuminer-opt has no fees of any kind but donations are accepted.
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
|
||||
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
|
||||
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
|
||||
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
|
||||
|
||||
Happy mining!
|
||||
|
||||
|
26
README.txt
26
README.txt
@@ -12,32 +12,36 @@ the software, don't use it.
|
||||
Choose the exe that best matches you CPU's features or use trial and
|
||||
error to find the fastest one that doesn't crash. Pay attention to
|
||||
the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using all the available features.
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Pentium and Celeron often have fewer features.
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing the
|
||||
latest features.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
|
||||
|
||||
More information for Intel and AMD CPU architectures and their features
|
||||
can be found on Wikipedia.
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere, Sandy-Ivybridge
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Skylake, Coffeelake
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
|
||||
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
|
||||
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
|
||||
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
|
||||
|
||||
|
||||
|
457
RELEASE_NOTES
457
RELEASE_NOTES
@@ -1,21 +1,17 @@
|
||||
puminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
|
||||
This feature requires recent SW including GCC version 5 or higher and
|
||||
openssl version 1.1 or higher. It may also require using "-march=znver1"
|
||||
compile flag.
|
||||
cpuminer-opt is a console program run from the command line using the
|
||||
keyboard, not the mouse.
|
||||
|
||||
HW SHA support is only available when compiled from source, Windows binaries
|
||||
are not yet available.
|
||||
|
||||
cpuminer-opt is a console program, if you're using a mouse you're doing it
|
||||
wrong.
|
||||
See also README.md for list of supported algorithms,
|
||||
|
||||
Security warning
|
||||
----------------
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
miners. The source code is open for anyone to inspect. If you don't trust
|
||||
the software, don't use it.
|
||||
usually a false positive, they are flagged simply because they are
|
||||
cryptocurrency miners. However, some malware has been spread using the
|
||||
cover that miners are known to be subject to false positives. Always be on
|
||||
alert. The source code of cpuminer-opt is open for anyone to inspect.
|
||||
If you don't trust the software don't download it.
|
||||
|
||||
The cryptographic hashing code has been taken from trusted sources but has been
|
||||
modified for speed at the expense of accepted security practices. This
|
||||
@@ -25,140 +21,323 @@ required.
|
||||
Compile Instructions
|
||||
--------------------
|
||||
|
||||
Requirements:
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux or Windows operating system. Apple is not supported.
|
||||
|
||||
Building on linux prerequisites:
|
||||
|
||||
It is assumed users know how to install packages on their system and
|
||||
be able to compile standard source packages. This is basic Linux and
|
||||
beyond the scope of cpuminer-opt.
|
||||
|
||||
Make sure you have the basic development packages installed.
|
||||
Here is a good start:
|
||||
|
||||
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
||||
|
||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||
are some of the ones that may not be in the default install and need to
|
||||
be installed manually. There may be others, read the error messages they
|
||||
will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu:
|
||||
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
|
||||
|
||||
build-essential (for Ubuntu, Development Tools package group on Fedora)
|
||||
automake
|
||||
libjansson-dev
|
||||
libgmp-dev
|
||||
libcurl4-openssl-dev
|
||||
libssl-dev
|
||||
pthreads
|
||||
zlib
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
|
||||
or higher. Reports of improved performiance on Ryzen when using openssl 1.0.2
|
||||
have been due to AVX and AVX2 optimizations added to that version.
|
||||
Additional improvements are expected on Ryzen with openssl 1.1.
|
||||
"-march-znver1" or "-msha".
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
Static builds should only considered in a homogeneous HW and SW environment.
|
||||
Local builds will always have the best performance and compatibility.
|
||||
|
||||
Extract cpuminer source.
|
||||
|
||||
tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
cd cpuminer-opt-x.y.z
|
||||
|
||||
Run ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Additional optional compile flags, add the following to CFLAGS to activate:
|
||||
|
||||
-DUSE_SPH_SHA (deprecated)
|
||||
|
||||
SPH may give slightly better performance on algos that use sha256 when using
|
||||
openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
|
||||
better than SPH. This option is ignored when 4-way is used, even for CPUs
|
||||
with SHA.
|
||||
|
||||
Start mining.
|
||||
|
||||
./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
Windows
|
||||
|
||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||
with a more recent compiler than the following Windows hosted procedure.
|
||||
|
||||
Building on Windows prerequisites:
|
||||
|
||||
msys
|
||||
mingw_w64
|
||||
Visual C++ redistributable 2008 X64
|
||||
openssl
|
||||
|
||||
Install msys and mingw_w64, only needed once.
|
||||
|
||||
Unpack msys into C:\msys or your preferred directory.
|
||||
|
||||
Install mingw_w64 from win-builds.
|
||||
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
|
||||
existing msys instalation.
|
||||
|
||||
Open a msys shell by double clicking on msys.bat.
|
||||
Note that msys shell uses linux syntax for file specifications, "C:\" is
|
||||
mounted at "/c/".
|
||||
|
||||
Add mingw bin directory to PATH variable
|
||||
PATH="/c/msys/opt/windows_64/bin/:$PATH"
|
||||
|
||||
Instalation complete, compile cpuminer-opt.
|
||||
|
||||
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
|
||||
or similar Windows program.
|
||||
|
||||
In msys shell cd to miner directory.
|
||||
cd /c/path/to/cpuminer-opt
|
||||
|
||||
Run build.sh to build on Windows or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Start mining
|
||||
|
||||
cpuminer.exe -a algo -o url -u user -p password
|
||||
|
||||
The following tips may be useful for older AMD CPUs.
|
||||
|
||||
AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
|
||||
not supported by cpuminer-opt due to an incompatible implementation of SSE2
|
||||
on these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Some users with AMD CPUs without AES_NI have reported problems compiling
|
||||
with build.sh or "-march=native". Problems have included compile errors
|
||||
and poor performance. These users are recommended to compile manually
|
||||
specifying "-march=btver1" on the configure command line.
|
||||
|
||||
Support for even older x86_64 without AES_NI or SSE2 is not availble.
|
||||
|
||||
64 bit Linux or Windows operating system. Apple, Android and Rpi are
|
||||
not supported. FreeBSD YMMV.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.11.0
|
||||
|
||||
Fixed x25x AVX512 lane 4 invalid shares.
|
||||
|
||||
AVX512 for hex, phi2.
|
||||
|
||||
VAES optimzation for Intel Icelake CPUs for most algos recently optimized
|
||||
with AVX512, source code only.
|
||||
|
||||
v3.10.7
|
||||
|
||||
AVX512 for x25x, lbry, x13bcd (bcd).
|
||||
|
||||
v3.10.6
|
||||
|
||||
Added support for SSL stratum: stratum+tcps://
|
||||
|
||||
Added job id reporting again, but leaner, suppressed with --quiet.
|
||||
|
||||
AVX512 for x21s, x22i, lyra2z, allium.
|
||||
|
||||
Fixed share overflow warnings mining lbry with Ryzen (SHA).
|
||||
|
||||
v3.10.5
|
||||
|
||||
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
|
||||
Faster hmq1725 AVX2.
|
||||
|
||||
v3.10.4
|
||||
|
||||
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
|
||||
|
||||
v3.10.3
|
||||
|
||||
AVX512 for x12, x13, x14, x15.
|
||||
Fixed x12 AVX2 invalid shares.
|
||||
|
||||
v.10.2
|
||||
|
||||
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
||||
Fixed c11 AVX2 invalid shares.
|
||||
|
||||
v3.10.1
|
||||
|
||||
AVX512 for blake2b, nist5, quark, tribus.
|
||||
|
||||
More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
|
||||
quark invalid shares AVX2.
|
||||
|
||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||
features are available but no longer listed.
|
||||
|
||||
v3.10.0
|
||||
|
||||
AVX512 is now supported on selected algos, Windows binary is now available.
|
||||
AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
|
||||
skein & skein2.
|
||||
|
||||
Fixed CPU temperature for some CPU models (Linux only).
|
||||
|
||||
Fixed a bug that caused some lanes not to submit shares.
|
||||
|
||||
Fixed some previously undetected buffer overflows.
|
||||
|
||||
Lyra2rev2 3% faster SSE2 and AVX2.
|
||||
|
||||
Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
|
||||
to fix known mingw issue.
|
||||
|
||||
Changed AVX2 build script to explicitly add AES to address change in
|
||||
behaviour in GCC 9.
|
||||
|
||||
v3.9.11
|
||||
|
||||
Added x22i & x25x algos.
|
||||
Blake2s 2% faster AVX2 with Intel CPU, slower with Ryzen v1, v2 ?
|
||||
|
||||
v3.9.10
|
||||
|
||||
Faster X* algos with AVX2.
|
||||
Small improvements to summary stats report.
|
||||
|
||||
v3.9.9.1
|
||||
|
||||
Fixed a day1 bug that could cause the miner to idle for up to 2 minutes
|
||||
under certain circumstances.
|
||||
|
||||
Redesigned summary stats report now includes session statistics.
|
||||
|
||||
More robust handling of statistics to reduce corruption.
|
||||
|
||||
Removed --hide-diff option.
|
||||
|
||||
Better handling of cpu-affinity with more than 64 CPUs.
|
||||
|
||||
v3.9.9
|
||||
|
||||
Added power2b algo for MicroBitcoin.
|
||||
Added generic yespower-b2b (yespower + blake2b) algo to be used with
|
||||
the parameters introduced in v3.9.7 for yespower & yescrypt.
|
||||
Display additional info when a share is rejected.
|
||||
Some low level enhancements and minor tweaking of log output.
|
||||
RELEASE_NOTES (this file) and README.md added to Windows release package.
|
||||
|
||||
v3.9.8.1
|
||||
|
||||
Summary log report will be generated on stratum diff change or after 5 minutes,
|
||||
whichever comes first, to prevent incorrect data in the report.
|
||||
|
||||
Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
|
||||
to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
|
||||
Until the fork Luxcoin can be mined using phi2 algo.
|
||||
|
||||
--hide-diff option is deprecated and has no effect. It will be removed in a
|
||||
future release.
|
||||
|
||||
v3.9.8
|
||||
|
||||
Changes to log output to provide data more relevant to actual mining
|
||||
performance.
|
||||
phi2 can now handle pools with a mix of coins that use and don't use roots.
|
||||
phi2-lux added as an alias for phi2 as they are identical except for roots.
|
||||
Add x16rv2 algo for Ravencoin fork.
|
||||
|
||||
v3.9.7
|
||||
|
||||
Command line option changes:
|
||||
|
||||
"-R" is no longer used as a shortcut for "--retry-pause", users must
|
||||
use the long option.
|
||||
|
||||
New options:
|
||||
|
||||
-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos
|
||||
-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is
|
||||
hardcoded with R=1
|
||||
-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos.
|
||||
|
||||
These options can be used to mine yescrypt or yespower variations using
|
||||
the generic yescrypt or yespower algo name and specifying the parameters
|
||||
manually. They can even be used to mine variations that aren't formally
|
||||
supported by a unique algo name. Existing algos can continue to to be mined
|
||||
using their original name without parameters.
|
||||
|
||||
v3.9.6.2
|
||||
|
||||
New algo blake2b.
|
||||
Faster myr-gr on Ryzen using SHA.
|
||||
Faster blake2s SSE2.
|
||||
Small speedup of around 1% for several other algos.
|
||||
|
||||
v3.9.6.1
|
||||
|
||||
New algos: x21s, hex (alias x16r-hex).
|
||||
|
||||
v3.9.6
|
||||
|
||||
New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
|
||||
|
||||
v3.9.5.4
|
||||
|
||||
Fixed sha256q AVX2 poor performance.
|
||||
Fixed skein2 buffer overflow and restored bswap-interleave optimization.
|
||||
More restructuring.
|
||||
|
||||
v3.9.5.3
|
||||
|
||||
Fix crash mining hodl with aes-sse42.
|
||||
More restructuring and share report tweaks.
|
||||
|
||||
v3.9.5.2
|
||||
|
||||
Revert bswap-interleave optimization for causing crashes on Windows.
|
||||
|
||||
v3.9.5.1
|
||||
|
||||
Fixed skein2 crash on Windows.
|
||||
|
||||
Fixed CPU temperature reading on Ubuntu 19.04.
|
||||
|
||||
Realigned log message colours, blue is used to report normal activity and
|
||||
yellow is only used to report abnormal activity.
|
||||
|
||||
Changed stats colours, yellow now means below average, white is average
|
||||
range. Tweaked colour thresholds.
|
||||
|
||||
Changed colour of stratum difficulty change messages to blue to match other
|
||||
normal protocol messages. Blue messages (block, stratum, submit) will no
|
||||
longer be displayed when using -q option.
|
||||
|
||||
Added job id to new block, share submit, and share result messages and added
|
||||
new nessage when a new job is received for an existing block. This will for
|
||||
better troubleshooting of invalid job id rejects seen at zergpool.
|
||||
|
||||
Some more restructuring.
|
||||
|
||||
v3.9.5
|
||||
|
||||
New share reporting information includes calculation of equivalent hashrate
|
||||
based on share difficulty, network latency, 5 minute summary.
|
||||
Per-thread hash rate reports are disabled by default.
|
||||
New command line option --hash-meter added to enable per-thread hash rates.
|
||||
|
||||
|
||||
v3.9.4
|
||||
|
||||
Faster AVX2 for lyra2v3, quark, anime.
|
||||
Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
|
||||
Faster skein2 with 4way AVX2 enabled.
|
||||
Automatic SHA override on Ryzen CPUs, no need for -DRYZEN compile flag.
|
||||
Ongoing restructuring.
|
||||
|
||||
v3.9.3.1
|
||||
|
||||
Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
|
||||
Fixed x16r algo 25% invalid share reject rate. The bug may have also
|
||||
affected other algos.
|
||||
|
||||
v3.9.2.5
|
||||
|
||||
Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
|
||||
More restructuring.
|
||||
|
||||
v3.9.2.4
|
||||
|
||||
Yet another affinity fix. Hopefully the last one.
|
||||
|
||||
v3.9.2.3
|
||||
|
||||
Another cpu-affinity fix.
|
||||
Disabled test code that fails to compile on some CPUs with limited
|
||||
AVX512 capabilities.
|
||||
|
||||
v3.9.2.2
|
||||
|
||||
Fixed some day one cpu-affinity issues.
|
||||
|
||||
v3.9.2
|
||||
|
||||
Added sha256q algo.
|
||||
Yespower now uses openssl SHA256, but no observable hash rate increase
|
||||
on Ryzen.
|
||||
Ongoing rearchitecting.
|
||||
Lyra2z now hashes 8-way on CPUs with AVX2.
|
||||
Lyra2 (all including phi2) now runs optimized code with SSE2.
|
||||
|
||||
v3.9.1.1
|
||||
|
||||
Fixed lyra2v3 AVX and below.
|
||||
|
||||
Compiling on Windows using Cygwin now works. Simply use "./build.sh"
|
||||
just like on Linux. It isn't portable therefore the binaries package will
|
||||
continue to use the existing procedure.
|
||||
The Cygwin procedure will be documented in more detail later and will
|
||||
include a list of packages that need to be installed.
|
||||
|
||||
v3.9.1
|
||||
|
||||
Fixed AVX2 version of anime algo.
|
||||
|
||||
Added sonoa algo.
|
||||
|
||||
Added "-DRYZEN_" compile option for Ryzen to override 4-way hashing when algo
|
||||
contains sha256 and use SHA instead. This is due to a combination of
|
||||
the introduction of HW SHA support combined with the poor performance
|
||||
of AVX2 on Ryzen. The Windows binaries package replaces cpuminer-avx2-sha
|
||||
with cpuminer-zen compiled with the override. Refer to the build instructions
|
||||
for more information.
|
||||
|
||||
Ongoing restructuring to streamline the process, reduce latency,
|
||||
reduce memory usage and unnecessary copying of data. Most of these
|
||||
will not result in a notoceably higher reported hashrate as the
|
||||
change simply reduces the time wasted that wasn't factored into the
|
||||
hash rate reported by the miner. In short, less dead time resulting in
|
||||
a higher net hashrate.
|
||||
|
||||
One of these measures to reduce latency also results in an enhanced
|
||||
share submission message including the share number*, the CPU thread,
|
||||
and the vector lane that found the solution. The time difference between
|
||||
the share submission and acceptance (or rejection) response indicates
|
||||
network ltatency. One other effect of this change is a reduction in hash
|
||||
meter messages because the scan function no longer exits when a share is
|
||||
found. Scan cycles will go longer and submit multiple shares per cycle.
|
||||
*the share number is antcipated and includes both accepted and rejected
|
||||
shares. Because the share is antipated and not synchronized it may be
|
||||
incorrect in time of very rapid share submission. Under most conditions
|
||||
it should be easy to match the submission with the corresponding response.
|
||||
|
||||
Removed "-DUSE_SPH_SHA" option, all users should have a recent version of
|
||||
openssl installed: v1.0.2 (Ubuntu 16.04) or better. Ryzen SHA requires
|
||||
v1.1.0 or better. Ryzen SHA is not used when hashing multi-way parallel.
|
||||
Ryzen SHA is available in the Windows binaries release package.
|
||||
|
||||
Improved compile instructions, now in seperate files: INSTALL_LINUX and
|
||||
INSTALL_WINDOWS. The Windows instructions are used to build the binaries
|
||||
release package. It's built on a Linux system either running as a virtual
|
||||
machine or a seperate computer. At this time there is no known way to
|
||||
build natively on a Windows system.
|
||||
|
||||
v3.9.0.1
|
||||
|
||||
Isolate Windows CPU groups code when CPU groups support not explicitly defined.
|
||||
|
||||
v3.9.0
|
||||
|
||||
Added support for Windows CPU groups.
|
||||
@@ -167,6 +346,7 @@ Prep work for AVX512.
|
||||
Added lyra2rev3 for the vertcoin algo change.
|
||||
Added yespower, yespowerr16 (Yenten)
|
||||
Added phi2 algo for LUX
|
||||
Discontinued support for cryptonight and variants.
|
||||
|
||||
v3.8.8.1
|
||||
|
||||
@@ -350,6 +530,7 @@ Changed default sha256 and sha512 to openssl. This should be used when
|
||||
compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
|
||||
This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
|
||||
myr-gr & others when openssl 1.0.2 is installed.
|
||||
Note: -DUSE_SPH_SHA has been removed in v3.9.1.
|
||||
Users with openssl 1.0.1 (Ubuntu 14.04) may get better perforance by adding
|
||||
"-DUSE_SPH_SHA" to CLAGS.
|
||||
Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
|
||||
|
231
algo-gate-api.c
231
algo-gate-api.c
@@ -71,7 +71,6 @@ bool return_false () { return false; }
|
||||
void *return_null () { return NULL; }
|
||||
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
|
||||
|
||||
|
||||
void algo_not_tested()
|
||||
{
|
||||
applog( LOG_WARNING,"Algo %s has not been tested live. It may not work",
|
||||
@@ -117,13 +116,10 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->get_nonceptr = (void*)&std_get_nonceptr;
|
||||
gate->work_decode = (void*)&std_le_work_decode;
|
||||
gate->decode_extra_data = (void*)&do_nothing;
|
||||
gate->wait_for_diff = (void*)&std_wait_for_diff;
|
||||
gate->get_max64 = (void*)&get_max64_0x1fffffLL;
|
||||
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
|
||||
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
|
||||
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
|
||||
gate->malloc_txs_request = (void*)&std_malloc_txs_request;
|
||||
gate->set_target = (void*)&std_set_target;
|
||||
gate->submit_getwork_result = (void*)&std_le_submit_getwork_result;
|
||||
gate->build_block_header = (void*)&std_build_block_header;
|
||||
gate->build_extraheader = (void*)&std_build_extraheader;
|
||||
@@ -149,109 +145,121 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
// called by each thread that uses the gate
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
{
|
||||
if ( NULL == gate )
|
||||
{
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
|
||||
return false;
|
||||
}
|
||||
if ( NULL == gate )
|
||||
{
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
init_algo_gate( gate );
|
||||
init_algo_gate( gate );
|
||||
|
||||
switch (algo)
|
||||
{
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_DROP: register_drop_algo ( gate ); break;
|
||||
case ALGO_FRESH: register_fresh_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: register_phi2_algo ( gate ); break;
|
||||
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
switch (algo)
|
||||
{
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_DROP: register_drop_algo ( gate ); break;
|
||||
case ALGO_FRESH: register_fresh_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
|
||||
case ALGO_HEX: register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: register_phi2_algo ( gate ); break;
|
||||
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13BCD: register_x13bcd_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
/* case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_05_algo ( gate ); break;
|
||||
*/
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
} // switch
|
||||
|
||||
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
} // switch
|
||||
|
||||
// ensure required functions were defined.
|
||||
// ensure required functions were defined.
|
||||
if ( gate->scanhash == (void*)&null_scanhash )
|
||||
{
|
||||
applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
|
||||
@@ -266,7 +274,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
// override std defaults with jr2 defaults
|
||||
bool register_json_rpc2( algo_gate_t *gate )
|
||||
{
|
||||
gate->wait_for_diff = (void*)&do_nothing;
|
||||
applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
|
||||
// gate->wait_for_diff = (void*)&do_nothing;
|
||||
gate->get_new_work = (void*)&jr2_get_new_work;
|
||||
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
|
||||
gate->stratum_gen_work = (void*)&jr2_stratum_gen_work;
|
||||
@@ -305,6 +317,7 @@ const char* const algo_alias_map[][2] =
|
||||
{ "argon2d-crds", "argon2d250" },
|
||||
{ "argon2d-dyn", "argon2d500" },
|
||||
{ "argon2d-uis", "argon2d4096" },
|
||||
{ "bcd", "x13bcd" },
|
||||
{ "bitcore", "timetravel10" },
|
||||
{ "bitzeny", "yescryptr8" },
|
||||
{ "blake256r8", "blakecoin" },
|
||||
@@ -323,25 +336,22 @@ const char* const algo_alias_map[][2] =
|
||||
{ "lyra2", "lyra2re" },
|
||||
{ "lyra2v2", "lyra2rev2" },
|
||||
{ "lyra2v3", "lyra2rev3" },
|
||||
{ "lyra2zoin", "lyra2z330" },
|
||||
{ "myrgr", "myr-gr" },
|
||||
{ "myriad", "myr-gr" },
|
||||
{ "neo", "neoscrypt" },
|
||||
{ "phi", "phi1612" },
|
||||
// { "sia", "blake2b" },
|
||||
{ "sib", "x11gost" },
|
||||
{ "timetravel8", "timetravel" },
|
||||
{ "ziftr", "zr5" },
|
||||
{ "veil", "x16rt-veil" },
|
||||
{ "x16r-hex", "hex" },
|
||||
{ "yenten", "yescryptr16" },
|
||||
{ "yescryptr8k", "yescrypt" },
|
||||
{ "zcoin", "lyra2z" },
|
||||
{ "zoin", "lyra2z330" },
|
||||
{ "ziftr", "zr5" },
|
||||
{ NULL, NULL }
|
||||
};
|
||||
|
||||
// if arg is a valid alias for a known algo it is updated with the proper name.
|
||||
// No validation of the algo or alias is done, It is the responsinility of the
|
||||
// calling function to validate the algo after return.
|
||||
// if arg is a valid alias for a known algo it is updated with the proper
|
||||
// name. No validation of the algo or alias is done, It is the responsinility
|
||||
// of the calling function to validate the algo after return.
|
||||
void get_algo_alias( char** algo_or_alias )
|
||||
{
|
||||
int i;
|
||||
@@ -354,3 +364,6 @@ void get_algo_alias( char** algo_or_alias )
|
||||
}
|
||||
}
|
||||
|
||||
#undef ALIAS
|
||||
#undef PROPER
|
||||
|
||||
|
@@ -2,8 +2,7 @@
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "interleave.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////////
|
||||
////
|
||||
@@ -36,7 +35,7 @@
|
||||
// 6. Determine if other non existant functions are required.
|
||||
// That is determined by the need to add code in cpu-miner.c
|
||||
// that applies only to the new algo. That is forbidden. All
|
||||
// algo specific code must be in theh algo's file.
|
||||
// algo specific code must be in the algo's file.
|
||||
//
|
||||
// 7. If new functions need to be added to the gate add the type
|
||||
// to the structure, declare a null instance in this file and define
|
||||
@@ -49,10 +48,10 @@
|
||||
// instances as they are defined by default, or unsafe functions that
|
||||
// are not needed by the algo.
|
||||
//
|
||||
// 9. Add an case entry to the switch/case in function register_gate
|
||||
// 9. Add a case entry to the switch/case in function register_gate
|
||||
// in file algo-gate-api.c for the new algo.
|
||||
//
|
||||
// 10 If a new function type was defined add an entry to ini talgo_gate
|
||||
// 10 If a new function type was defined add an entry to init algo_gate
|
||||
// to initialize the new function to its null instance described in step 7.
|
||||
//
|
||||
// 11. If the new algo has aliases add them to the alias array in
|
||||
@@ -86,14 +85,16 @@
|
||||
|
||||
typedef uint32_t set_t;
|
||||
|
||||
#define EMPTY_SET 0
|
||||
#define SSE2_OPT 1
|
||||
#define AES_OPT 2
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8
|
||||
#define AVX2_OPT 0x10
|
||||
#define SHA_OPT 0x20
|
||||
#define AVX512_OPT 0x40
|
||||
#define EMPTY_SET 0
|
||||
#define SSE2_OPT 1
|
||||
#define AES_OPT 2
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8 // Sandybridge
|
||||
#define AVX2_OPT 0x10 // Haswell
|
||||
#define SHA_OPT 0x20 // sha256 (Ryzen, Ice Lake)
|
||||
#define AVX512_OPT 0x40 // AVX512- F, VL, DQ, BW (Skylake-X)
|
||||
#define VAES_OPT 0x80 // VAES (Ice Lake)
|
||||
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
||||
@@ -110,35 +111,62 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||
typedef struct
|
||||
{
|
||||
// mandatory functions, must be overwritten
|
||||
int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
// optional unsafe, must be overwritten if algo uses function
|
||||
void ( *hash ) ( void*, const void*, uint32_t ) ;
|
||||
void ( *hash_suw ) ( void*, const void* );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
|
||||
// Allocate thread local buffers and other initialization specific to miner
|
||||
// threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Generate global blockheader from stratum data.
|
||||
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
|
||||
bool );
|
||||
|
||||
// Return pointer to nonce in blockheader.
|
||||
uint32_t *( *get_nonceptr ) ( uint32_t* );
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
void ( *wait_for_diff ) ( struct stratum_ctx* );
|
||||
int64_t ( *get_max64 ) ();
|
||||
|
||||
// Decode getwork blockheader
|
||||
bool ( *work_decode ) ( const json_t*, struct work* );
|
||||
void ( *set_target) ( struct work*, double );
|
||||
|
||||
// Extra getwork data
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
|
||||
bool ( *submit_getwork_result ) ( CURL*, struct work* );
|
||||
|
||||
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
|
||||
|
||||
// Increment extranonce
|
||||
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
|
||||
|
||||
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
|
||||
uint32_t*, uint32_t, uint32_t );
|
||||
uint32_t*, uint32_t, uint32_t );
|
||||
// Build mining.submit message
|
||||
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
|
||||
// Big or little
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
|
||||
// Wait for first work
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
|
||||
// Diverge mining threads
|
||||
bool ( *do_this_thread ) ( int );
|
||||
|
||||
// After do_this_thread
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
|
||||
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
|
||||
bool ( *stratum_handle_response )( json_t* );
|
||||
set_t optimizations;
|
||||
@@ -147,7 +175,6 @@ int ntime_index;
|
||||
int nbits_index;
|
||||
int nonce_index; // use with caution, see warning below
|
||||
int work_cmp_size;
|
||||
|
||||
} algo_gate_t;
|
||||
|
||||
extern algo_gate_t algo_gate;
|
||||
@@ -194,8 +221,6 @@ void null_hash_suw();
|
||||
|
||||
// optional safe targets, default listed first unless noted.
|
||||
|
||||
void std_wait_for_diff();
|
||||
|
||||
uint32_t *std_get_nonceptr( uint32_t *work_data );
|
||||
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
|
||||
|
||||
@@ -210,25 +235,13 @@ void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
// pick your favorite or define your own
|
||||
int64_t get_max64_0x1fffffLL(); // default
|
||||
int64_t get_max64_0x40LL();
|
||||
int64_t get_max64_0x3ffff();
|
||||
int64_t get_max64_0x3fffffLL();
|
||||
int64_t get_max64_0x1ffff();
|
||||
int64_t get_max64_0xffffLL();
|
||||
|
||||
void std_set_target( struct work *work, double job_diff );
|
||||
void alt_set_target( struct work* work, double job_diff );
|
||||
void scrypt_set_target( struct work *work, double job_diff );
|
||||
|
||||
bool std_le_work_decode( const json_t *val, struct work *work );
|
||||
bool std_be_work_decode( const json_t *val, struct work *work );
|
||||
bool jr2_work_decode( const json_t *val, struct work *work );
|
||||
bool jr2_work_decode( const json_t *val, struct work *work );
|
||||
|
||||
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
|
||||
|
||||
void std_le_build_stratum_request( char *req, struct work *work );
|
||||
void std_be_build_stratum_request( char *req, struct work *work );
|
||||
@@ -242,8 +255,8 @@ void set_work_data_big_endian( struct work *work );
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
||||
void std_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits );
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits );
|
||||
|
||||
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
|
||||
|
||||
@@ -264,8 +277,8 @@ int std_get_work_data_size();
|
||||
// by calling the algo's register function.
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate );
|
||||
|
||||
// Override any default gate functions that are applicable and do any other
|
||||
// algo-specific initialization.
|
||||
// Called by algos toverride any default gate functions that are applicable
|
||||
// and do any other algo-specific initialization.
|
||||
// The register functions for all the algos can be declared here to reduce
|
||||
// compiler warnings but that's just more work for devs adding new algos.
|
||||
bool register_algo( algo_gate_t *gate );
|
||||
@@ -278,5 +291,7 @@ bool register_json_rpc2( algo_gate_t *gate );
|
||||
// use this to call the hash function of an algo directly, ie util.c test.
|
||||
void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
|
||||
void get_algo_alias( char** algo_or_alias );
|
||||
// Validate a string as a known algo and alias, updates arg to proper
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
|
@@ -42,12 +42,14 @@ void argon2hash(void *output, const void *input)
|
||||
(unsigned char *)output);
|
||||
}
|
||||
|
||||
int scanhash_argon2(int thr_id, struct work* work, uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -72,19 +74,14 @@ int scanhash_argon2(int thr_id, struct work* work, uint32_t max_nonce, uint64_t
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t argon2_get_max64 ()
|
||||
{
|
||||
return 0x1ffLL;
|
||||
}
|
||||
|
||||
bool register_argon2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->get_max64 = (void*)&argon2_get_max64;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -33,45 +33,42 @@ void argon2d_crds_hash( void *output, const void *input )
|
||||
argon2_ctx( &context, Argon2_d );
|
||||
}
|
||||
|
||||
int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint32_t nonce = first_nonce;
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_crds_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_crds_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio(work, hash);
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_argon2d_crds_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
||||
gate->hash = (void*)&argon2d_crds_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -103,52 +100,50 @@ void argon2d_dyn_hash( void *output, const void *input )
|
||||
argon2_ctx( &context, Argon2_d );
|
||||
}
|
||||
|
||||
int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint32_t nonce = first_nonce;
|
||||
do
|
||||
{
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_dyn_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_dyn_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio(work, hash);
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
||||
gate->hash = (void*)&argon2d_dyn_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Unitus
|
||||
|
||||
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) vhash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
@@ -157,7 +152,7 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t t_cost = 1; // 1 iteration
|
||||
uint32_t m_cost = 4096; // use 4MB
|
||||
uint32_t parallelism = 1; // 1 thread, 2 lanes
|
||||
@@ -169,11 +164,10 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc( &endiandata[19], n );
|
||||
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
|
||||
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
|
||||
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
|
||||
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
submit_solution( work, vhash, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
@@ -185,14 +179,11 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t get_max64_0x1ff() { return 0x1ff; }
|
||||
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0x1ff;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -9,23 +9,23 @@ bool register_argon2d_crds_algo( algo_gate_t* gate );
|
||||
|
||||
void argon2d_crds_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
// Dynamic: version = 0x10, m_cost = 500.
|
||||
bool register_argon2d_dyn_algo( algo_gate_t* gate );
|
||||
|
||||
void argon2d_dyn_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
|
||||
// Unitus: version = 0x13, m_cost = 4096.
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate );
|
||||
|
||||
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -17,7 +17,7 @@
|
||||
|
||||
#if !defined(ARGON2_NO_THREADS)
|
||||
|
||||
#include "thread.h"
|
||||
#include "argon2d_thread.h"
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#endif
|
@@ -28,9 +28,10 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <mm_malloc.h>
|
||||
|
||||
#include "core.h"
|
||||
#include "thread.h"
|
||||
#include "argon2d_thread.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blake2-impl.h"
|
||||
|
||||
@@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
|
||||
if (context->allocate_cbk) {
|
||||
(context->allocate_cbk)(memory, memory_size);
|
||||
} else {
|
||||
*memory = malloc(memory_size);
|
||||
*memory = _mm_malloc( memory_size, 64 );
|
||||
// *memory = malloc(memory_size);
|
||||
}
|
||||
|
||||
if (*memory == NULL) {
|
||||
@@ -112,11 +114,12 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
|
||||
void free_memory(const argon2_context *context, uint8_t *memory,
|
||||
size_t num, size_t size) {
|
||||
size_t memory_size = num*size;
|
||||
clear_internal_memory(memory, memory_size);
|
||||
// clear_internal_memory(memory, memory_size);
|
||||
if (context->free_cbk) {
|
||||
(context->free_cbk)(memory, memory_size);
|
||||
} else {
|
||||
free(memory);
|
||||
// free(memory);
|
||||
_mm_free( memory );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,7 +140,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
|
||||
int FLAG_clear_internal_memory = 0;
|
||||
void clear_internal_memory(void *v, size_t n) {
|
||||
if (FLAG_clear_internal_memory && v) {
|
||||
secure_wipe_memory(v, n);
|
||||
// secure_wipe_memory(v, n);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -559,7 +562,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
context->pwdlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
|
||||
secure_wipe_memory(context->pwd, context->pwdlen);
|
||||
// secure_wipe_memory(context->pwd, context->pwdlen);
|
||||
context->pwdlen = 0;
|
||||
}
|
||||
}
|
||||
@@ -580,7 +583,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
context->secretlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
|
||||
secure_wipe_memory(context->secret, context->secretlen);
|
||||
// secure_wipe_memory(context->secret, context->secretlen);
|
||||
context->secretlen = 0;
|
||||
}
|
||||
}
|
||||
|
@@ -21,7 +21,7 @@
|
||||
|
||||
#include "argon2.h"
|
||||
#include "core.h"
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blamka-round-opt.h"
|
||||
|
||||
@@ -37,24 +37,28 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static void fill_block(__m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
block_XY[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
block_XY[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
state[ 4], state[ 5], state[ 6], state[ 7] );
|
||||
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
|
||||
state[ 9], state[11], state[13], state[15] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_1(
|
||||
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
|
||||
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_2(
|
||||
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
|
||||
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
|
||||
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
|
||||
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,14 +87,14 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
block_XY[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,21 +116,9 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
|
||||
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
|
||||
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
block_XY[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
|
||||
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -29,6 +29,8 @@
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if !defined(__AVX512F__)
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(__XOP__)
|
||||
@@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
||||
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
|
||||
#define rotr32( x ) mm256_ror_64( x, 32 )
|
||||
#define rotr24( x ) mm256_ror_64( x, 24 )
|
||||
#define rotr16( x ) mm256_ror_64( x, 16 )
|
||||
#define rotr63( x ) mm256_rol_64( x, 1 )
|
||||
|
||||
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
||||
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
//#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
//#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
|
||||
|
||||
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
||||
__m256i ml0, ml1; \
|
||||
ml0 = _mm256_mul_epu32(A0, B0); \
|
||||
ml1 = _mm256_mul_epu32(A1, B1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
|
||||
D0 = _mm256_xor_si256(D0, A0); \
|
||||
D0 = rotr32(D0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C0, D0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
||||
\
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B0 = rotr24(B0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(A1, B1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
||||
D1 = _mm256_xor_si256(D1, A1); \
|
||||
D0 = rotr32(D0); \
|
||||
D1 = rotr32(D1); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C1, D1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
||||
\
|
||||
ml0 = _mm256_mul_epu32(C0, D0); \
|
||||
ml1 = _mm256_mul_epu32(C1, D1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B1 = _mm256_xor_si256(B1, C1); \
|
||||
B0 = rotr24(B0); \
|
||||
B1 = rotr24(B1); \
|
||||
} while((void)0, 0);
|
||||
|
||||
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
||||
__m256i ml0, ml1; \
|
||||
ml0 = _mm256_mul_epu32(A0, B0); \
|
||||
ml1 = _mm256_mul_epu32(A1, B1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
|
||||
D0 = _mm256_xor_si256(D0, A0); \
|
||||
D0 = rotr16(D0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C0, D0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B0 = rotr63(B0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(A1, B1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
||||
D1 = _mm256_xor_si256(D1, A1); \
|
||||
D0 = rotr16(D0); \
|
||||
D1 = rotr16(D1); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C1, D1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
||||
ml0 = _mm256_mul_epu32(C0, D0); \
|
||||
ml1 = _mm256_mul_epu32(C1, D1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B1 = _mm256_xor_si256(B1, C1); \
|
||||
B0 = rotr63(B0); \
|
||||
B1 = rotr63(B1); \
|
||||
} while((void)0, 0);
|
||||
|
||||
@@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
\
|
||||
tmp1 = C0; \
|
||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
C1 = tmp1; \
|
||||
\
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C1 = tmp1; \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
} while(0);
|
||||
|
||||
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
\
|
||||
tmp1 = C0; \
|
||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
C1 = tmp1; \
|
||||
\
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C1 = tmp1; \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
} while((void)0, 0);
|
||||
|
||||
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
@@ -430,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
} while((void)0, 0)
|
||||
|
||||
|
@@ -15,11 +15,11 @@ void blakehash_4way(void *state, const void *input)
|
||||
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
|
||||
blake256r14_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way_close( &ctx, vhash );
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
@@ -27,43 +27,34 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r14_4way_init( &blake_4w_ctx );
|
||||
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
|
||||
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
blakehash_4way( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= HTarget )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -79,13 +70,13 @@ void blakehash_8way( void *state, const void *input )
|
||||
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
|
||||
blake256r14_8way( &ctx, input + (64<<3), 16 );
|
||||
blake256r14_8way_close( &ctx, vhash );
|
||||
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
|
||||
state+128, state+160, state+192, state+224,
|
||||
vhash, 256 );
|
||||
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
|
||||
state+128, state+160, state+192, state+224,
|
||||
vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
@@ -93,33 +84,21 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
|
||||
blake256r14_8way_init( &blake_8w_ctx );
|
||||
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
|
||||
|
||||
uint32_t *noncep = vdata + 152; // 19*8
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
be32enc( noncep +4, n+4 );
|
||||
be32enc( noncep +5, n+5 );
|
||||
be32enc( noncep +6, n+6 );
|
||||
be32enc( noncep +7, n+7 );
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blakehash_8way( hash, vdata );
|
||||
@@ -128,17 +107,14 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
num_found++;
|
||||
nonces[i] = n+i;
|
||||
work_set_target_ratio( work, hash+1 );
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,18 +1,8 @@
|
||||
#include "blake-gate.h"
|
||||
|
||||
int64_t blake_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blake_get_max64;
|
||||
//#if defined (__AVX2__) && defined (FOUR_WAY)
|
||||
// gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
// gate->scanhash = (void*)&scanhash_blake_8way;
|
||||
// gate->hash = (void*)&blakehash_8way;
|
||||
#if defined(BLAKE_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->scanhash = (void*)&scanhash_blake_4way;
|
||||
|
@@ -10,12 +10,12 @@
|
||||
|
||||
#if defined (BLAKE_4WAY)
|
||||
void blakehash_4way(void *state, const void *input);
|
||||
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void blakehash( void *state, const void *input );
|
||||
int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blake( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -37,7 +37,7 @@
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
//#ifdef __SSE4_2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -45,7 +45,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
@@ -57,30 +57,35 @@ extern "C"{
|
||||
// Blake-256 4 way
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i H[8];
|
||||
__m128i S[4];
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context;
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *cc);
|
||||
void blake256_4way(void *cc, const void *data, size_t len);
|
||||
void blake256_4way_close(void *cc, void *dst);
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
#define blake256_4way blake256_4way_update
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r14_4way blake256r14_4way_update
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r8_4way blake256r8_4way_update
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
@@ -90,7 +95,6 @@ void blake256r8_4way_close(void *cc, void *dst);
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
@@ -99,45 +103,98 @@ typedef struct {
|
||||
// Default 14 rounds
|
||||
typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
//#define blake256_8way blake256_8way_update
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
void blake256r14_8way_init(void *cc);
|
||||
void blake256r14_8way(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_8way_small_context blake256r8_8way_context;
|
||||
void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r8_8way blake256r8_8way_update
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_4way_big_context;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init(void *cc);
|
||||
void blake512_4way(void *cc, const void *data, size_t len);
|
||||
void blake512_4way_close(void *cc, void *dst);
|
||||
void blake512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
#define blake512_4way blake512_4way_update
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//Blake-256 16 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_16way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_16way_small_context blake256r8_16way_context;
|
||||
void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
|
||||
// Blake-512 8 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BLAKE_HASH_4WAY_H__
|
||||
|
@@ -39,8 +39,8 @@ void blakehash(void *state, const void *input)
|
||||
|
||||
}
|
||||
|
||||
int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -49,6 +49,7 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
|
1358
algo/blake/blake256-hash-4way.c
Normal file
1358
algo/blake/blake256-hash-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
113
algo/blake/blake2b-4way.c
Normal file
113
algo/blake/blake2b-4way.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
20
algo/blake/blake2b-gate.c
Normal file
20
algo/blake/blake2b-gate.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
// gate->hash = (void*)&blake2b_8way_hash;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
34
algo/blake/blake2b-gate.h
Normal file
34
algo/blake/blake2b-gate.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#ifndef __BLAKE2B_GATE_H__
|
||||
#define __BLAKE2B_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2b_hash( void *state, const void *input );
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
372
algo/blake/blake2b-hash-4way.c
Normal file
372
algo/blake/blake2b-hash-4way.c
Normal file
@@ -0,0 +1,372 @@
|
||||
/*
|
||||
* Copyright 2009 Colin Percival, 2014 savale
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* This file was originally written by Colin Percival as part of the Tarsnap
|
||||
* online backup system.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
static const uint8_t sigma[12][16] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define B2B8W_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
{
|
||||
__m512i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
v[ 1] = ctx->h[1];
|
||||
v[ 2] = ctx->h[2];
|
||||
v[ 3] = ctx->h[3];
|
||||
v[ 4] = ctx->h[4];
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
|
||||
m[ 0] = ctx->b[ 0];
|
||||
m[ 1] = ctx->b[ 1];
|
||||
m[ 2] = ctx->b[ 2];
|
||||
m[ 3] = ctx->b[ 3];
|
||||
m[ 4] = ctx->b[ 4];
|
||||
m[ 5] = ctx->b[ 5];
|
||||
m[ 6] = ctx->b[ 6];
|
||||
m[ 7] = ctx->b[ 7];
|
||||
m[ 8] = ctx->b[ 8];
|
||||
m[ 9] = ctx->b[ 9];
|
||||
m[10] = ctx->b[10];
|
||||
m[11] = ctx->b[11];
|
||||
m[12] = ctx->b[12];
|
||||
m[13] = ctx->b[13];
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B8W_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B8W_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
|
||||
B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
|
||||
B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
|
||||
B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
|
||||
B2B8W_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
ctx->c = 0;
|
||||
ctx->outlen = 32;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
ctx->b[i] = m512_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m512i* in =(__m512i*)input;
|
||||
|
||||
size_t i, c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
for ( i = 0; i < (inlen >> 3); i++ )
|
||||
{
|
||||
if ( ctx->c == 128 )
|
||||
{
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_8way_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
ctx->c += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
|
||||
while ( ctx->c < 128 )
|
||||
{
|
||||
ctx->b[c++] = m512_zero;
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m512i( out, 0 ) = ctx->h[0];
|
||||
casti_m512i( out, 1 ) = ctx->h[1];
|
||||
casti_m512i( out, 2 ) = ctx->h[2];
|
||||
casti_m512i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// G Mixing function.
|
||||
|
||||
#define B2B_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
// Initialization Vector.
|
||||
/*
|
||||
static const uint64_t blake2b_iv[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
*/
|
||||
|
||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
{
|
||||
__m256i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
v[ 1] = ctx->h[1];
|
||||
v[ 2] = ctx->h[2];
|
||||
v[ 3] = ctx->h[3];
|
||||
v[ 4] = ctx->h[4];
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm256_not( v[14] );
|
||||
|
||||
m[ 0] = ctx->b[ 0];
|
||||
m[ 1] = ctx->b[ 1];
|
||||
m[ 2] = ctx->b[ 2];
|
||||
m[ 3] = ctx->b[ 3];
|
||||
m[ 4] = ctx->b[ 4];
|
||||
m[ 5] = ctx->b[ 5];
|
||||
m[ 6] = ctx->b[ 6];
|
||||
m[ 7] = ctx->b[ 7];
|
||||
m[ 8] = ctx->b[ 8];
|
||||
m[ 9] = ctx->b[ 9];
|
||||
m[10] = ctx->b[10];
|
||||
m[11] = ctx->b[11];
|
||||
m[12] = ctx->b[12];
|
||||
m[13] = ctx->b[13];
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
|
||||
B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
|
||||
B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
|
||||
B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
|
||||
B2B_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
|
||||
B2B_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
ctx->c = 0;
|
||||
ctx->outlen = 32;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
ctx->b[i] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m256i* in =(__m256i*)input;
|
||||
|
||||
size_t i, c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
for ( i = 0; i < (inlen >> 3); i++ )
|
||||
{
|
||||
if ( ctx->c == 128 )
|
||||
{
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_4way_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
ctx->c += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
|
||||
while ( ctx->c < 128 )
|
||||
{
|
||||
ctx->b[c++] = m256_zero;
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m256i( out, 0 ) = ctx->h[0];
|
||||
casti_m256i( out, 1 ) = ctx->h[1];
|
||||
casti_m256i( out, 2 ) = ctx->h[2];
|
||||
casti_m256i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif
|
53
algo/blake/blake2b-hash-4way.h
Normal file
53
algo/blake/blake2b-hash-4way.h
Normal file
@@ -0,0 +1,53 @@
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
||||
#define __BLAKE2B_HASH_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <inttypes.h>
|
||||
#define inline __inline
|
||||
#define ALIGN(x) __declspec(align(x))
|
||||
#else
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN(128) typedef struct {
|
||||
__m512i b[16]; // input buffer
|
||||
__m512i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_8way_ctx;
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(128) typedef struct {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_4way_ctx;
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -3,13 +3,11 @@
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
|
||||
//static __thread sph_blake2b_ctx s_midstate;
|
||||
//static __thread sph_blake2b_ctx s_ctx;
|
||||
#define MIDLEN 76
|
||||
#define A 64
|
||||
|
||||
@@ -25,26 +23,17 @@ void blake2b_hash(void *output, const void *input)
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
/*
|
||||
static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
|
||||
{
|
||||
s_ctx.outlen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
sph_blake2b_final(&s_ctx, (uint8_t*) output);
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(A) vhashcpu[8];
|
||||
uint32_t _ALIGN(A) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[8];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
@@ -52,179 +41,23 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
// midstate (untested yet)
|
||||
//blake2b_init(&s_midstate, 32, NULL, 0);
|
||||
//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
|
||||
//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[8], n);
|
||||
be32enc(&endiandata[19], n);
|
||||
//blake2b_hash_end(vhashcpu, endiandata);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
|
||||
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
|
||||
work_set_target_ratio(work, vhashcpu);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[8] = n;
|
||||
pdata[19] = n;
|
||||
return 1;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[8] = n;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void swab256(void *dest_p, const void *src_p)
|
||||
{
|
||||
uint32_t *dest = (uint32_t *)dest_p;
|
||||
const uint32_t *src = (uint32_t *)src_p;
|
||||
|
||||
dest[0] = swab32(src[7]);
|
||||
dest[1] = swab32(src[6]);
|
||||
dest[2] = swab32(src[5]);
|
||||
dest[3] = swab32(src[4]);
|
||||
dest[4] = swab32(src[3]);
|
||||
dest[5] = swab32(src[2]);
|
||||
dest[6] = swab32(src[1]);
|
||||
dest[7] = swab32(src[0]);
|
||||
}
|
||||
|
||||
/* compute nbits to get the network diff */
|
||||
void blake2b_calc_network_diff(struct work *work)
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
uint32_t nbits = work->data[11]; // unsure if correct
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
for (int m=shift; m < 29; m++) d *= 256.0;
|
||||
for (int m=29; m < shift; m++) d /= 256.0;
|
||||
if (opt_debug_diff)
|
||||
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
|
||||
net_diff = d;
|
||||
}
|
||||
|
||||
void blake2b_be_build_stratum_request( char *req, struct work *work )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
|
||||
be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
uint16_t high_nonce = swab32(work->data[9]) >> 16;
|
||||
xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free( xnonce2str );
|
||||
}
|
||||
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
|
||||
// merkle root handled here, no need for gen_merkle_root gate target
|
||||
void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
size_t t;
|
||||
int i;
|
||||
|
||||
// merkle root
|
||||
memcpy( merkle_root, sctx->job.coinbase, 32 );
|
||||
headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
// g_work->data[0] = le32dec( sctx->job.version );
|
||||
// for ( i = 0; i < 8; i++ )
|
||||
// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
|
||||
// for ( i = 0; i < 8; i++ )
|
||||
// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
|
||||
g_work->data[8] = 0; // nonce
|
||||
g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 );
|
||||
g_work->data[10] = be32dec( sctx->job.ntime );
|
||||
g_work->data[11] = be32dec( sctx->job.nbits );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr, bool clean_job )
|
||||
{
|
||||
const int wkcmp_sz = 32; // bytes
|
||||
const int wkcmp_off = 32 + 16;
|
||||
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
|
||||
|
||||
if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
|
||||
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|
||||
|| strcmp( work->job_id, g_work->job_id ) ) )
|
||||
{
|
||||
work_free( work );
|
||||
work_copy( work, g_work );
|
||||
*nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
|
||||
if ( opt_randomize )
|
||||
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
|
||||
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
|
||||
}
|
||||
else
|
||||
++(*nonceptr);
|
||||
|
||||
// suprnova job_id check without data/target/height change...
|
||||
// we just may have copied new g_wwork to work so why this test here?
|
||||
// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) )
|
||||
// exit thread loop
|
||||
// continue;
|
||||
// else
|
||||
// {
|
||||
// nonceptr[1] += 0x10;
|
||||
// nonceptr[1] |= thr_id;
|
||||
// }
|
||||
}
|
||||
|
||||
bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
// extradata: prevent duplicates
|
||||
work->data[ 8 ] += 0x10;
|
||||
work->data[ 8 + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
double blake2b_get_max64() { return 0x1fffffLL; }
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
algo_not_tested();
|
||||
gate->ntime_index = 10;
|
||||
gate->nbits_index = 11;
|
||||
gate->nonce_index = 8;
|
||||
gate->work_cmp_size = 32;
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
gate->calc_network_diff = (void*)&blake2b_calc_network_diff;
|
||||
gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->build_extraheader = (void*)&blake2b_build_extraheader;
|
||||
gate->get_new_work = (void*)&blake2b_get_new_work;
|
||||
gate->get_max64 = (void*)&blake2b_get_max64;
|
||||
gate->ready_to_mine = (void*)&blake2b_ready_to_mine;
|
||||
have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
|
@@ -3,73 +3,115 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
|
||||
output+128, output+160, output+192, output+224,
|
||||
vhash, 256 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 152; // 19*8
|
||||
int thr_id = mythr->id;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
be32enc( noncep +4, n+4 );
|
||||
be32enc( noncep +5, n+5 );
|
||||
be32enc( noncep +6, n+6 );
|
||||
be32enc( noncep +7, n+7 );
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
@@ -78,60 +120,51 @@ static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
|
||||
vhash, 256 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
int thr_id = mythr->id;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,15 +1,12 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
@@ -19,8 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,31 +4,45 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_4WAY)
|
||||
|
||||
void blake2s_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2s_hash( void *state, const void *input );
|
||||
int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -17,13 +17,16 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
/*
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
{
|
||||
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
|
||||
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
|
||||
};
|
||||
*/
|
||||
|
||||
static const uint8_t blake2s_sigma[10][16] =
|
||||
{
|
||||
@@ -39,6 +42,7 @@ static const uint8_t blake2s_sigma[10][16] =
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
|
||||
};
|
||||
|
||||
|
||||
// define a constant for initial param.
|
||||
|
||||
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
@@ -57,8 +61,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
for( int i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
@@ -76,41 +90,45 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm_set1_epi32( blake2s_IV[3] );
|
||||
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[4] ) );
|
||||
m128_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[5] ) );
|
||||
m128_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[6] ) );
|
||||
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[7] ) );
|
||||
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W(r,i,a,b,c,d) \
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define ROUND4W(r) \
|
||||
do { \
|
||||
G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND4W( 0 );
|
||||
@@ -132,26 +150,47 @@ do { \
|
||||
return 0;
|
||||
}
|
||||
|
||||
// There is a problem that can't be resolved internally.
|
||||
// If the last block is a full 64 bytes it should not be compressed in
|
||||
// update but left for final. However, when streaming, it isn't known
|
||||
// which block is last. There may be a subsequent call to update to add
|
||||
// more data.
|
||||
//
|
||||
// The reference code handled this by juggling 2 blocks at a time at
|
||||
// a significant performance penalty.
|
||||
//
|
||||
// Instead a new function is introduced called full_blocks which combines
|
||||
// update and final and is to be used in non-streaming mode where the data
|
||||
// is a multiple of 64 bytes.
|
||||
//
|
||||
// Supported:
|
||||
// 64 + 16 bytes (blake2s with midstate optimization)
|
||||
// 80 bytes (blake2s without midstate optimization)
|
||||
// Any multiple of 64 bytes in one shot (x25x)
|
||||
//
|
||||
// Unsupported:
|
||||
// Stream of full 64 byte blocks one at a time.
|
||||
|
||||
// use only when streaming more data or final block not full.
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
if( inlen >= BLAKE2S_BLOCKBYTES - left )
|
||||
{
|
||||
memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
|
||||
S->buflen += BLAKE2S_BLOCKBYTES - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
input += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -183,8 +222,45 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Update and final when inlen is a multiple of 64 bytes
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m128i *in = (__m128i*)input;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
{
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
blake2s_4way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m128i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// The commented code below is slower on Intel but faster on
|
||||
// Zen1 AVX2. It's also faster than Zen1 AVX.
|
||||
// Ryzen gen2 is unknown at this time.
|
||||
|
||||
int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
{
|
||||
__m256i m[16];
|
||||
@@ -193,6 +269,23 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
m256_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
m256_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
@@ -206,6 +299,7 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
@@ -219,7 +313,36 @@ do { \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define G8W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G8W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G8W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G8W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G8W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
@@ -231,6 +354,7 @@ do { \
|
||||
G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
ROUND8W( 0 );
|
||||
ROUND8W( 1 );
|
||||
@@ -267,8 +391,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
for( int i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
@@ -329,9 +463,203 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Update and final when inlen is a multiple of 64 bytes
|
||||
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m256i *in = (__m256i*)input;
|
||||
__m256i *buf = (__m256i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
{
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_8way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
blake2s_8way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m256i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake2s-256 16 way
|
||||
|
||||
int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
{
|
||||
__m512i m[16];
|
||||
__m512i v[16];
|
||||
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
m512_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
m512_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND16W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND16W( 0 );
|
||||
ROUND16W( 1 );
|
||||
ROUND16W( 2 );
|
||||
ROUND16W( 3 );
|
||||
ROUND16W( 4 );
|
||||
ROUND16W( 5 );
|
||||
ROUND16W( 6 );
|
||||
ROUND16W( 7 );
|
||||
ROUND16W( 8 );
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
{
|
||||
blake2s_nway_param P[1];
|
||||
|
||||
P->digest_length = outlen;
|
||||
P->key_length = 0;
|
||||
P->fanout = 1;
|
||||
P->depth = 1;
|
||||
P->leaf_length = 0;
|
||||
*((uint64_t*)(P->node_offset)) = 0;
|
||||
P->node_depth = 0;
|
||||
P->inner_length = 0;
|
||||
memset( P->salt, 0, sizeof( P->salt ) );
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m512i *input = (__m512i*)in;
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
{
|
||||
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_16way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
|
||||
S->buflen += (size_t) inlen;
|
||||
input += ( inlen>>2 );
|
||||
inlen -= inlen;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
|
||||
{
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node )
|
||||
S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_512( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
blake2s_16way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m512i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#if 0
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
|
@@ -14,9 +14,9 @@
|
||||
#ifndef __BLAKE2S_HASH_4WAY_H__
|
||||
#define __BLAKE2S_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@@ -74,6 +74,9 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -91,6 +94,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
|
||||
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN( 128 ) typedef struct __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
uint8_t last_node;
|
||||
} blake2s_16way_state ;
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -107,6 +131,6 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __SSE4_2__
|
||||
#endif // __SSE2__
|
||||
|
||||
#endif
|
||||
|
@@ -32,14 +32,15 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
}
|
||||
*/
|
||||
int scanhash_blake2s(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -69,18 +70,3 @@ int scanhash_blake2s(int thr_id, struct work *work,
|
||||
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
824
algo/blake/blake512-hash-4way.c
Normal file
824
algo/blake/blake512-hash-4way.c
Normal file
@@ -0,0 +1,824 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
|
||||
*/
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
#define Z02 2
|
||||
#define Z03 3
|
||||
#define Z04 4
|
||||
#define Z05 5
|
||||
#define Z06 6
|
||||
#define Z07 7
|
||||
#define Z08 8
|
||||
#define Z09 9
|
||||
#define Z0A A
|
||||
#define Z0B B
|
||||
#define Z0C C
|
||||
#define Z0D D
|
||||
#define Z0E E
|
||||
#define Z0F F
|
||||
|
||||
#define Z10 E
|
||||
#define Z11 A
|
||||
#define Z12 4
|
||||
#define Z13 8
|
||||
#define Z14 9
|
||||
#define Z15 F
|
||||
#define Z16 D
|
||||
#define Z17 6
|
||||
#define Z18 1
|
||||
#define Z19 C
|
||||
#define Z1A 0
|
||||
#define Z1B 2
|
||||
#define Z1C B
|
||||
#define Z1D 7
|
||||
#define Z1E 5
|
||||
#define Z1F 3
|
||||
|
||||
#define Z20 B
|
||||
#define Z21 8
|
||||
#define Z22 C
|
||||
#define Z23 0
|
||||
#define Z24 5
|
||||
#define Z25 2
|
||||
#define Z26 F
|
||||
#define Z27 D
|
||||
#define Z28 A
|
||||
#define Z29 E
|
||||
#define Z2A 3
|
||||
#define Z2B 6
|
||||
#define Z2C 7
|
||||
#define Z2D 1
|
||||
#define Z2E 9
|
||||
#define Z2F 4
|
||||
|
||||
#define Z30 7
|
||||
#define Z31 9
|
||||
#define Z32 3
|
||||
#define Z33 1
|
||||
#define Z34 D
|
||||
#define Z35 C
|
||||
#define Z36 B
|
||||
#define Z37 E
|
||||
#define Z38 2
|
||||
#define Z39 6
|
||||
#define Z3A 5
|
||||
#define Z3B A
|
||||
#define Z3C 4
|
||||
#define Z3D 0
|
||||
#define Z3E F
|
||||
#define Z3F 8
|
||||
|
||||
#define Z40 9
|
||||
#define Z41 0
|
||||
#define Z42 5
|
||||
#define Z43 7
|
||||
#define Z44 2
|
||||
#define Z45 4
|
||||
#define Z46 A
|
||||
#define Z47 F
|
||||
#define Z48 E
|
||||
#define Z49 1
|
||||
#define Z4A B
|
||||
#define Z4B C
|
||||
#define Z4C 6
|
||||
#define Z4D 8
|
||||
#define Z4E 3
|
||||
#define Z4F D
|
||||
|
||||
#define Z50 2
|
||||
#define Z51 C
|
||||
#define Z52 6
|
||||
#define Z53 A
|
||||
#define Z54 0
|
||||
#define Z55 B
|
||||
#define Z56 8
|
||||
#define Z57 3
|
||||
#define Z58 4
|
||||
#define Z59 D
|
||||
#define Z5A 7
|
||||
#define Z5B 5
|
||||
#define Z5C F
|
||||
#define Z5D E
|
||||
#define Z5E 1
|
||||
#define Z5F 9
|
||||
|
||||
#define Z60 C
|
||||
#define Z61 5
|
||||
#define Z62 1
|
||||
#define Z63 F
|
||||
#define Z64 E
|
||||
#define Z65 D
|
||||
#define Z66 4
|
||||
#define Z67 A
|
||||
#define Z68 0
|
||||
#define Z69 7
|
||||
#define Z6A 6
|
||||
#define Z6B 3
|
||||
#define Z6C 9
|
||||
#define Z6D 2
|
||||
#define Z6E 8
|
||||
#define Z6F B
|
||||
|
||||
#define Z70 D
|
||||
#define Z71 B
|
||||
#define Z72 7
|
||||
#define Z73 E
|
||||
#define Z74 C
|
||||
#define Z75 1
|
||||
#define Z76 3
|
||||
#define Z77 9
|
||||
#define Z78 5
|
||||
#define Z79 0
|
||||
#define Z7A F
|
||||
#define Z7B 4
|
||||
#define Z7C 8
|
||||
#define Z7D 6
|
||||
#define Z7E 2
|
||||
#define Z7F A
|
||||
|
||||
#define Z80 6
|
||||
#define Z81 F
|
||||
#define Z82 E
|
||||
#define Z83 9
|
||||
#define Z84 B
|
||||
#define Z85 3
|
||||
#define Z86 0
|
||||
#define Z87 8
|
||||
#define Z88 C
|
||||
#define Z89 2
|
||||
#define Z8A D
|
||||
#define Z8B 7
|
||||
#define Z8C 1
|
||||
#define Z8D 4
|
||||
#define Z8E A
|
||||
#define Z8F 5
|
||||
|
||||
#define Z90 A
|
||||
#define Z91 2
|
||||
#define Z92 8
|
||||
#define Z93 4
|
||||
#define Z94 7
|
||||
#define Z95 6
|
||||
#define Z96 1
|
||||
#define Z97 5
|
||||
#define Z98 F
|
||||
#define Z99 B
|
||||
#define Z9A 9
|
||||
#define Z9B E
|
||||
#define Z9C 3
|
||||
#define Z9D C
|
||||
#define Z9E D
|
||||
#define Z9F 0
|
||||
|
||||
#define Mx(r, i) Mx_(Z ## r ## i)
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake-512 8 way AVX512
|
||||
|
||||
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B_8WAY(r) do { \
|
||||
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY do \
|
||||
{ \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m512i shuf_bswap64; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
|
||||
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
|
||||
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
|
||||
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB4 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB5 ) ); \
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB6 ) ); \
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
ROUND_B_8WAY(6); \
|
||||
ROUND_B_8WAY(7); \
|
||||
ROUND_B_8WAY(8); \
|
||||
ROUND_B_8WAY(9); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
H0 = mm512_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm512_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm512_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm512_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm512_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm512_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm512_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = zero;
|
||||
casti_m512i( sc->S, 1 ) = zero;
|
||||
casti_m512i( sc->S, 2 ) = zero;
|
||||
casti_m512i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_8WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_8WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
// uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
// z = 0x80 >> n;
|
||||
// zz = ((ub & -z) | z) & 0xFF;
|
||||
// buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
buf[ptr>>3] = m512_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm512_or_si512( buf[104>>3],
|
||||
m512_const1_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_8way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_8way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_8way_close(cc, dst);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
{ \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m256i shuf_bswap64; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
m256_const1_64( CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
m256_const1_64( CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
m256_const1_64( CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
m256_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
||||
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
||||
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
||||
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
||||
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
||||
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
||||
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
||||
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
||||
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
||||
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
||||
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
||||
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
||||
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
||||
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
ROUND_B_4WAY(6); \
|
||||
ROUND_B_4WAY(7); \
|
||||
ROUND_B_4WAY(8); \
|
||||
ROUND_B_4WAY(9); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_4WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if (ptr == buf_size )
|
||||
{
|
||||
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
{
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>3] = m256_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm256_or_si256( buf[104>>3],
|
||||
m256_const1_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
blake512_4way_init(void *cc)
|
||||
{
|
||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake64_4way_close( cc, dst );
|
||||
|
||||
// blake512_4way_addbits_and_close(cc, dst);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -17,11 +17,11 @@ void blakecoin_4way_hash(void *state, const void *input)
|
||||
blake256r8_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way_close( &ctx, vhash );
|
||||
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
@@ -29,41 +29,34 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r8_4way_init( &blakecoin_4w_ctx );
|
||||
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
|
||||
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
blakecoin_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -81,13 +74,12 @@ void blakecoin_8way_hash( void *state, const void *input )
|
||||
blake256r8_8way( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
|
||||
state+128, state+160, state+192, state+224,
|
||||
vhash, 256 );
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
@@ -95,46 +87,34 @@ int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
uint32_t *noncep = vdata + 152; // 19*8
|
||||
int num_found = 0;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
be32enc( noncep +4, n+4 );
|
||||
be32enc( noncep +5, n+5 );
|
||||
be32enc( noncep +6, n+6 );
|
||||
be32enc( noncep +7, n+7 );
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
blakecoin_8way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,13 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
// return 0x3fffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
@@ -23,7 +16,6 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -13,18 +13,18 @@
|
||||
|
||||
#if defined (BLAKECOIN_8WAY)
|
||||
void blakecoin_8way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
void blakecoin_4way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void blakecoinhash( void *state, const void *input );
|
||||
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
@@ -39,13 +39,14 @@ void blakecoinhash( void *state, const void *input )
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
@@ -92,33 +93,3 @@ int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
|
||||
}
|
||||
*/
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
blakecoin_init( &blake_init_ctx );
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
@@ -23,11 +23,11 @@ void decred_hash_4way( void *state, const void *input )
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
@@ -37,14 +37,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// copy to buffer guaranteed to be aligned.
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm128_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
@@ -59,18 +58,17 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
|
||||
if ( (hash+(i<<3))[7] <= HTarget )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -38,7 +38,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if ( net_diff > 0. )
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -116,7 +116,7 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->bloc_height = g_work->data[32];
|
||||
sctx->block_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
@@ -154,7 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
@@ -14,7 +14,7 @@
|
||||
|
||||
#if defined (__AVX2__)
|
||||
//void blakehash_84way(void *state, const void *input);
|
||||
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done );
|
||||
#endif
|
||||
|
||||
@@ -24,13 +24,13 @@
|
||||
|
||||
#if defined (DECRED_4WAY)
|
||||
void decred_hash_4way(void *state, const void *input);
|
||||
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void decred_hash( void *state, const void *input );
|
||||
int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -52,12 +52,14 @@ void decred_hash_simple(void *state, const void *input)
|
||||
sph_blake256_close(&ctx, state);
|
||||
}
|
||||
|
||||
int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
@@ -141,7 +143,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if (net_diff > 0.)
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -267,7 +269,6 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
@@ -10,13 +10,8 @@
|
||||
#include "blake-hash-4way.h"
|
||||
#include "sph_blake.h"
|
||||
|
||||
//#define DEBUG_ALGO
|
||||
|
||||
extern void pentablakehash_4way( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(32) hash[128];
|
||||
// // same as uint32_t hashA[16], hashB[16];
|
||||
// #define hashB hash+64
|
||||
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
@@ -30,21 +25,6 @@ extern void pentablakehash_4way( void *output, const void *input )
|
||||
blake512_4way( &ctx, input, 80 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
|
||||
mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
|
||||
sph_blake512_context ctx2_blake;
|
||||
sph_blake512_init(&ctx2_blake);
|
||||
sph_blake512(&ctx2_blake, sin0, 80);
|
||||
sph_blake512_close(&ctx2_blake, (void*) hash);
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
uint64_t* hash64 = (uint64_t*)hash;
|
||||
for( int i = 0; i < 8; i++ )
|
||||
{
|
||||
if ( hash0[i] != hash64[i] )
|
||||
printf("hash mismatch %u\n",i);
|
||||
}
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
@@ -61,46 +41,14 @@ for( int i = 0; i < 8; i++ )
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
|
||||
/*
|
||||
uint64_t sin0[10] __attribute__ ((aligned (64)));
|
||||
uint64_t sin1[10] __attribute__ ((aligned (64)));
|
||||
uint64_t sin2[10] __attribute__ ((aligned (64)));
|
||||
uint64_t sin3[10] __attribute__ ((aligned (64)));
|
||||
|
||||
sph_blake512_context ctx_blake;
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, input, 80);
|
||||
sph_blake512_close(&ctx_blake, hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
*/
|
||||
}
|
||||
|
||||
int scanhash_pentablake_4way( int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done )
|
||||
int scanhash_pentablake_4way( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
@@ -110,9 +58,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
// uint32_t _ALIGN(32) hash64[8];
|
||||
// uint32_t _ALIGN(32) endiandata[32];
|
||||
@@ -138,7 +85,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
{
|
||||
@@ -155,10 +102,10 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( !( (hash+(i<<3))[7] & mask )
|
||||
&& fulltest( hash+(i<<3), ptarget ) )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
pdata[19] = n + i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
|
@@ -10,7 +10,6 @@ bool register_pentablake_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&pentablakehash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -10,12 +10,12 @@
|
||||
|
||||
#if defined(PENTABLAKE_4WAY)
|
||||
void pentablakehash_4way( void *state, const void *input );
|
||||
int scanhash_pentablake_4way( int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done );
|
||||
int scanhash_pentablake_4way( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void pentablakehash( void *state, const void *input );
|
||||
int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_pentablake( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
|
@@ -40,8 +40,8 @@ extern void pentablakehash(void *output, const void *input)
|
||||
|
||||
}
|
||||
|
||||
int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
int scanhash_pentablake( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -49,6 +49,7 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[32];
|
||||
|
@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
v[13] ^= ctx->t[1]; // high 64 bits
|
||||
if (last) // last block flag set ?
|
||||
v[14] = ~v[14];
|
||||
|
||||
for (i = 0; i < 16; i++) // get little-endian words
|
||||
m[i] = B2B_GET64(&ctx->b[8 * i]);
|
||||
|
||||
@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )
|
||||
|
||||
while (ctx->c < 128) // fill up with zeros
|
||||
ctx->b[ctx->c++] = 0;
|
||||
blake2b_compress(ctx, 1); // final block flag = 1
|
||||
|
||||
blake2b_compress(ctx, 1); // final block flag = 1
|
||||
|
||||
// little endian convert and store
|
||||
for (i = 0; i < ctx->outlen; i++) {
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -41,15 +41,18 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_bmw256 256
|
||||
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// BMW-256 4 way 32
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[64];
|
||||
__m128i H[16];
|
||||
@@ -59,37 +62,124 @@ typedef struct {
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
void bmw256_4way_init( bmw256_4way_context *ctx );
|
||||
|
||||
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw256_4way bmw256_4way_update
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-256 8 way 32
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_8way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef bmw_8way_small_context bmw256_8way_context;
|
||||
|
||||
void bmw256_8way_init( bmw256_8way_context *ctx );
|
||||
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
#define bmw256_8way bmw256_8way_update
|
||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-256 16 way 32
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef bmw_16way_small_context bmw256_16way_context;
|
||||
|
||||
void bmw256_16way_init( bmw256_16way_context *ctx );
|
||||
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// BMW-512 2 way 64
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16];
|
||||
__m128i H[16];
|
||||
size_t ptr;
|
||||
uint64_t bit_count;
|
||||
} bmw_2way_big_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef bmw_2way_big_context bmw512_2way_context;
|
||||
|
||||
void bmw512_2way_init( bmw512_2way_context *ctx );
|
||||
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-512 4 way 64
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
} bmw_4way_big_context;
|
||||
} bmw_4way_big_context __attribute__((aligned(128)));
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
void bmw512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw512_4way bmw512_4way_update
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t bit_count;
|
||||
} bmw512_8way_context __attribute__((aligned(128)));
|
||||
|
||||
void bmw512_8way_init( bmw512_8way_context *ctx );
|
||||
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BMW_HASH_H__
|
||||
|
1658
algo/bmw/bmw256-hash-4way.c
Normal file
1658
algo/bmw/bmw256-hash-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -19,14 +19,15 @@ void bmwhash(void *output, const void *input)
|
||||
*/
|
||||
}
|
||||
|
||||
int scanhash_bmw(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_bmw( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
114
algo/bmw/bmw512-4way.c
Normal file
114
algo/bmw/bmw512-4way.c
Normal file
@@ -0,0 +1,114 @@
|
||||
#include "bmw512-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph_keccak.h"
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way(void *state, const void *input)
|
||||
{
|
||||
bmw512_8way_context ctx;
|
||||
bmw512_8way_init( &ctx );
|
||||
bmw512_8way_update( &ctx, input, 80 );
|
||||
bmw512_8way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
bmw512hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
//#ifdef BMW512_4WAY
|
||||
|
||||
void bmw512hash_4way(void *state, const void *input)
|
||||
{
|
||||
bmw512_4way_context ctx;
|
||||
bmw512_4way_init( &ctx );
|
||||
bmw512_4way_update( &ctx, input, 80 );
|
||||
bmw512_4way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
bmw512hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
20
algo/bmw/bmw512-gate.c
Normal file
20
algo/bmw/bmw512-gate.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "bmw512-gate.h"
|
||||
|
||||
bool register_bmw512_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
#if defined (BMW512_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_8way;
|
||||
gate->hash = (void*)&bmw512hash_8way;
|
||||
#elif defined (BMW512_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_4way;
|
||||
gate->hash = (void*)&bmw512hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_bmw512;
|
||||
gate->hash = (void*)&bmw512hash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
|
33
algo/bmw/bmw512-gate.h
Normal file
33
algo/bmw/bmw512-gate.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef BMW512_GATE_H__
|
||||
#define BMW512_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BMW512_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define BMW512_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way( void *state, const void *input );
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
void bmw512hash_4way( void *state, const void *input );
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void bmw512hash( void *state, const void *input );
|
||||
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
1515
algo/bmw/bmw512-hash-4way.c
Normal file
1515
algo/bmw/bmw512-hash-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
53
algo/bmw/bmw512.c
Normal file
53
algo/bmw/bmw512.c
Normal file
@@ -0,0 +1,53 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "sph_bmw.h"
|
||||
|
||||
void bmw512hash(void *state, const void *input)
|
||||
{
|
||||
sph_bmw512_context ctx;
|
||||
uint32_t hash[32];
|
||||
|
||||
sph_bmw512_init( &ctx );
|
||||
sph_bmw512( &ctx,input, 80 );
|
||||
sph_bmw512_close( &ctx, hash );
|
||||
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
//const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t endiandata[32];
|
||||
|
||||
for (int i=0; i < 19; i++)
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
|
||||
do {
|
||||
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
bmw512hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFFF00)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -242,6 +242,8 @@ void cryptolight_hash(void* output, const void* input, int len) {
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
|
||||
int len, struct cryptonight_ctx* ctx)
|
||||
{
|
||||
@@ -312,8 +314,10 @@ static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
|
||||
oaes_free((OAES_CTX **) &ctx->aes_ctx);
|
||||
}
|
||||
|
||||
int scanhash_cryptolight(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
#endif
|
||||
|
||||
int scanhash_cryptolight( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -322,6 +326,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
|
||||
const uint32_t first_nonce = n + 1;
|
||||
//const uint32_t Htarg = ptarget[7];
|
||||
uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
|
||||
|
||||
@@ -358,7 +363,6 @@ bool register_cryptolight_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptolight;
|
||||
gate->hash = (void*)&cryptolight_hash;
|
||||
gate->hash_suw = (void*)&cryptolight_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -70,11 +70,12 @@ void cryptonight_hash_suw( void *restrict output, const void *input )
|
||||
|
||||
bool cryptonightV7 = false;
|
||||
|
||||
int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
|
||||
uint32_t n = *nonceptr - 1;
|
||||
@@ -110,7 +111,6 @@ bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -122,7 +122,6 @@ bool register_cryptonightv7_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -40,8 +40,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len);
|
||||
void keccakf(uint64_t st[25], int rounds);
|
||||
extern void (* const extra_hashes[4])(const void *, size_t, char *);
|
||||
|
||||
int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
|
||||
|
||||
|
@@ -7,12 +7,212 @@
|
||||
|
||||
// 2x128
|
||||
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
||||
// If reinterleaving is necessary it may be more efficient to use
|
||||
// 2 way 256. The same transform code should work for both.
|
||||
|
||||
static void transform_4way( cube_4way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap128_64( x4 );
|
||||
x5 = mm512_swap128_64( x5 );
|
||||
x6 = mm512_swap128_64( x6 );
|
||||
x7 = mm512_swap128_64( x7 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( y0, 11 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( y1, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap64_32( x4 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
x6 = mm512_swap64_32( x6 );
|
||||
x7 = mm512_swap64_32( x7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
{
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// 2 way 128
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm256_load_si256( (__m256i*)sp->h );
|
||||
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
|
||||
@@ -29,18 +229,12 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x2;
|
||||
y1 = x3;
|
||||
y2 = x0;
|
||||
y3 = x1;
|
||||
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
|
||||
_mm256_srli_epi32( y0, 25 ) );
|
||||
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
|
||||
_mm256_srli_epi32( y1, 25 ) );
|
||||
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 7 ),
|
||||
_mm256_srli_epi32( y2, 25 ) );
|
||||
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 7 ),
|
||||
_mm256_srli_epi32( y3, 25 ) );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
@@ -53,18 +247,12 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x1;
|
||||
y1 = x0;
|
||||
y2 = x3;
|
||||
y3 = x2;
|
||||
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
|
||||
_mm256_srli_epi32( y0, 21 ) );
|
||||
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
|
||||
_mm256_srli_epi32( y1, 21 ) );
|
||||
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
|
||||
_mm256_srli_epi32( y2, 21 ) );
|
||||
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
|
||||
_mm256_srli_epi32( y3, 21 ) );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
@@ -83,39 +271,36 @@ static void transform_2way( cube_2way_context *sp )
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
|
||||
|
||||
int cube_2way_reinit( cube_2way_context *sp )
|
||||
{
|
||||
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
int blockbytes )
|
||||
{
|
||||
int i;
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
// all sizes of __m128i
|
||||
cube_2way_ctx_cache.hashlen = hashbitlen/128;
|
||||
cube_2way_ctx_cache.blocksize = blockbytes/16;
|
||||
cube_2way_ctx_cache.rounds = rounds;
|
||||
cube_2way_ctx_cache.pos = 0;
|
||||
|
||||
for ( i = 0; i < 8; ++i )
|
||||
cube_2way_ctx_cache.h[i] = m256_zero;
|
||||
|
||||
cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
|
||||
0, rounds, blockbytes, hashbitlen / 8,
|
||||
0, rounds, blockbytes, hashbitlen / 8 );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_2way( &cube_2way_ctx_cache );
|
||||
|
||||
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -126,9 +311,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
@@ -149,15 +331,15 @@ int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) );
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -182,15 +364,15 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
|
||||
1,0,0,0 ) );
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -1,10 +1,37 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "avxdefs.h"
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
struct _cube_4way_context
|
||||
{
|
||||
__m512i h[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_4way_reinit( cube_4way_context *sp );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
@@ -15,7 +42,7 @@ struct _cube_2way_context
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
|
@@ -13,14 +13,35 @@
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
#include <stdio.h>
|
||||
|
||||
static void transform( cubehashParam *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
register __m512i x0, x1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->x );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->x + 1 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
|
||||
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32(
|
||||
mm512_swap256_128( x0 ), 11 ), x1 );
|
||||
x1 = mm512_swap64_32( x1 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->x, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->x + 1, x1 );
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
register __m256i x0, x1, x2, x3, y0, y1;
|
||||
|
||||
@@ -34,26 +55,22 @@ static void transform( cubehashParam *sp )
|
||||
x2 = _mm256_add_epi32( x0, x2 );
|
||||
x3 = _mm256_add_epi32( x1, x3 );
|
||||
y0 = x0;
|
||||
x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
|
||||
_mm256_srli_epi32( x1, 25 ) );
|
||||
x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
|
||||
_mm256_srli_epi32( y0, 25 ) );
|
||||
x0 = mm256_rol_32( x1, 7 );
|
||||
x1 = mm256_rol_32( y0, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
x2 = _mm256_shuffle_epi32( x2, 0x4e );
|
||||
x3 = _mm256_shuffle_epi32( x3, 0x4e );
|
||||
x2 = mm256_swap128_64( x2 );
|
||||
x3 = mm256_swap128_64( x3 );
|
||||
x2 = _mm256_add_epi32( x0, x2 );
|
||||
x3 = _mm256_add_epi32( x1, x3 );
|
||||
y0 = _mm256_permute4x64_epi64( x0, 0x4e );
|
||||
y1 = _mm256_permute4x64_epi64( x1, 0x4e );
|
||||
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
|
||||
_mm256_srli_epi32( y0, 21 ) );
|
||||
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
|
||||
_mm256_srli_epi32( y1, 21 ) );
|
||||
y0 = mm256_swap_128( x0 );
|
||||
y1 = mm256_swap_128( x1 );
|
||||
x0 = mm256_rol_32( y0, 11 );
|
||||
x1 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
x2 = _mm256_shuffle_epi32( x2, 0xb1 );
|
||||
x3 = _mm256_shuffle_epi32( x3, 0xb1 );
|
||||
x2 = mm256_swap64_32( x2 );
|
||||
x3 = mm256_swap64_32( x3 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->x, x0 );
|
||||
@@ -128,48 +145,58 @@ static void transform( cubehashParam *sp )
|
||||
#endif
|
||||
} // transform
|
||||
|
||||
// Cubehash context initializing is very expensive.
|
||||
// Cache the intial value for faster reinitializing.
|
||||
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
|
||||
|
||||
int cubehashReinit( cubehashParam *sp )
|
||||
/*
|
||||
// The result of hashing 10 rounds of initial data which is params and
|
||||
// mostly zeros.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
|
||||
return SUCCESS;
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
}
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
*/
|
||||
|
||||
// Initialize the cache then copy to sp.
|
||||
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
|
||||
{
|
||||
int i;
|
||||
__m128i *x = (__m128i*)sp->x;
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
if ( hashbitlen < 8 ) return BAD_HASHBITLEN;
|
||||
if ( hashbitlen > 512 ) return BAD_HASHBITLEN;
|
||||
if ( hashbitlen != 8 * (hashbitlen / 8) ) return BAD_HASHBITLEN;
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
/* Sanity checks */
|
||||
if ( rounds <= 0 || rounds > 32 )
|
||||
rounds = CUBEHASH_ROUNDS;
|
||||
if ( blockbytes <= 0 || blockbytes >= 256)
|
||||
blockbytes = CUBEHASH_BLOCKBYTES;
|
||||
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
// all sizes of __m128i
|
||||
cube_ctx_cache.hashlen = hashbitlen/128;
|
||||
cube_ctx_cache.blocksize = blockbytes/16;
|
||||
cube_ctx_cache.rounds = rounds;
|
||||
cube_ctx_cache.pos = 0;
|
||||
|
||||
for ( i = 0; i < 8; ++i )
|
||||
cube_ctx_cache.x[i] = _mm_setzero_si128();;
|
||||
|
||||
cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
|
||||
hashbitlen / 8 );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform( &cube_ctx_cache );
|
||||
|
||||
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
@@ -186,7 +186,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,13 +390,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -513,13 +513,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
|
620
algo/echo/aes_ni/hash.c.test
Normal file
620
algo/echo/aes_ni/hash.c.test
Normal file
@@ -0,0 +1,620 @@
|
||||
/*
|
||||
* file : echo_vperm.c
|
||||
* version : 1.0.208
|
||||
* date : 14.12.2010
|
||||
*
|
||||
* - vperm and aes_ni implementations of hash function ECHO
|
||||
* - implements NIST hash api
|
||||
* - assumes that message lenght is multiple of 8-bits
|
||||
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
|
||||
* - define NO_AES_NI for aes_ni version
|
||||
*
|
||||
* Cagdas Calik
|
||||
* ccalik@metu.edu.tr
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
|
||||
*
|
||||
*/
|
||||
#if defined(__AES__)
|
||||
|
||||
#include <memory.h>
|
||||
#include "miner.h"
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
/*
|
||||
#ifndef NO_AES_NI
|
||||
#include <wmmintrin.h>
|
||||
#else
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
*/
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
|
||||
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
|
||||
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
|
||||
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
|
||||
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
|
||||
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
|
||||
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
|
||||
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
|
||||
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
|
||||
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
|
||||
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
|
||||
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
|
||||
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
|
||||
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
|
||||
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
|
||||
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
|
||||
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
|
||||
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
|
||||
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
|
||||
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
|
||||
|
||||
|
||||
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
|
||||
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
|
||||
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
|
||||
k1 = _mm_add_epi32(k1, M128(const1))
|
||||
|
||||
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||
t1 = _mm_srli_epi16(state1[0][j], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = s2;\
|
||||
state2[1][j] = state1[0][j];\
|
||||
state2[2][j] = state1[0][j];\
|
||||
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
|
||||
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
|
||||
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m128i t1, t2, s2, k1;
|
||||
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < ctx->uHashSize / 256; j++)
|
||||
_state[i][j] = ctx->state[i][j];
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t *b = (uint64_t*)_state;
|
||||
//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = _mm_setzero_si128();
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch(nHashSize)
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return BAD_HASHBITLEN;
|
||||
}
|
||||
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < nHashSize / 256; j++)
|
||||
ctx->state[i][j] = ctx->hashsize;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
uByteLength = (unsigned int)(databitlen / 8);
|
||||
|
||||
if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
|
||||
{
|
||||
if(state->uBufferBytes != 0)
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
|
||||
|
||||
// Process buffer
|
||||
Compress(state, state->buffer, 1);
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
uByteLength -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = uByteLength / state->uBlockLength;
|
||||
uRemainingBytes = uByteLength % state->uBlockLength;
|
||||
|
||||
if(uBlockCount > 0)
|
||||
{
|
||||
Compress(state, data, uBlockCount);
|
||||
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if(uRemainingBytes > 0)
|
||||
{
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
}
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
|
||||
state->uBufferBytes += uByteLength;
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
|
||||
// Enough buffer space for padding in this block?
|
||||
if((state->uBlockLength - state->uBufferBytes) >= 18)
|
||||
{
|
||||
// Pad with zeros
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
|
||||
|
||||
// Hash size
|
||||
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Last block contains message bits?
|
||||
if(state->uBufferBytes == 1)
|
||||
{
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
}
|
||||
|
||||
// Compress
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
|
||||
// Last block
|
||||
memset(state->buffer, 0, state->uBlockLength - 18);
|
||||
|
||||
// Hash size
|
||||
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen )
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
uByteLength = (unsigned int)(databitlen / 8);
|
||||
|
||||
/*
|
||||
if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
|
||||
{
|
||||
printf("full block\n");
|
||||
if( state->uBufferBytes != 0 )
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy( state->buffer + state->uBufferBytes,
|
||||
(void*)data, state->uBlockLength - state->uBufferBytes );
|
||||
|
||||
// Process buffer
|
||||
Compress( state, state->buffer, 1 );
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
uByteLength -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = uByteLength / state->uBlockLength;
|
||||
uRemainingBytes = uByteLength % state->uBlockLength;
|
||||
|
||||
if( uBlockCount > 0 )
|
||||
{
|
||||
Compress( state, data, uBlockCount );
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if( uRemainingBytes > 0 )
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
*/
|
||||
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
|
||||
state->uBufferBytes += uByteLength;
|
||||
// }
|
||||
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
|
||||
// Enough buffer space for padding in this block?
|
||||
|
||||
// if( (state->uBlockLength - state->uBufferBytes) >= 18 )
|
||||
// {
|
||||
// Pad with zeros
|
||||
|
||||
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
|
||||
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
uint64_t *b = (uint64_t*)&state->k;
|
||||
/*
|
||||
printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
b = (uint64_t*)state->buffer;
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
|
||||
|
||||
b = (uint64_t*)state->state;
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
|
||||
*/
|
||||
// Compress
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
|
||||
/*
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
memset( state->buffer, 0, state->uBlockLength - 18 );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
|
||||
state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
*/
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
HashReturn hRet;
|
||||
hashState_echo hs;
|
||||
|
||||
/////
|
||||
/*
|
||||
__m128i a, b, c, d, t[4], u[4], v[4];
|
||||
|
||||
a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
|
||||
t[0] = _mm_unpacklo_epi8(a, b);
|
||||
t[1] = _mm_unpackhi_epi8(a, b);
|
||||
t[2] = _mm_unpacklo_epi8(c, d);
|
||||
t[3] = _mm_unpackhi_epi8(c, d);
|
||||
|
||||
u[0] = _mm_unpacklo_epi16(t[0], t[2]);
|
||||
u[1] = _mm_unpackhi_epi16(t[0], t[2]);
|
||||
u[2] = _mm_unpacklo_epi16(t[1], t[3]);
|
||||
u[3] = _mm_unpackhi_epi16(t[1], t[3]);
|
||||
|
||||
|
||||
t[0] = _mm_unpacklo_epi16(u[0], u[1]);
|
||||
t[1] = _mm_unpackhi_epi16(u[0], u[1]);
|
||||
t[2] = _mm_unpacklo_epi16(u[2], u[3]);
|
||||
t[3] = _mm_unpackhi_epi16(u[2], u[3]);
|
||||
|
||||
u[0] = _mm_unpacklo_epi8(t[0], t[1]);
|
||||
u[1] = _mm_unpackhi_epi8(t[0], t[1]);
|
||||
u[2] = _mm_unpacklo_epi8(t[2], t[3]);
|
||||
u[3] = _mm_unpackhi_epi8(t[2], t[3]);
|
||||
|
||||
a = _mm_unpacklo_epi8(u[0], u[1]);
|
||||
b = _mm_unpackhi_epi8(u[0], u[1]);
|
||||
c = _mm_unpacklo_epi8(u[2], u[3]);
|
||||
d = _mm_unpackhi_epi8(u[2], u[3]);
|
||||
*/
|
||||
/////
|
||||
|
||||
hRet = init_echo(&hs, hashbitlen);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
hRet = update_echo(&hs, data, databitlen);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
hRet = final_echo(&hs, hashval);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
317
algo/echo/echo-hash-4way.c
Normal file
317
algo/echo/echo-hash-4way.c
Normal file
@@ -0,0 +1,317 @@
|
||||
//#if 0
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "echo-hash-4way.h"
|
||||
|
||||
/*
|
||||
static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57,
|
||||
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
|
||||
};
|
||||
*/
|
||||
// do these need to be reversed?
|
||||
|
||||
#define mul2mask \
|
||||
_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
#define lsbmask m512_const1_32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES( state, i, j ) \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
|
||||
k1 = _mm512_add_epi32( k1, m512_one_128 );
|
||||
|
||||
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
const int j1 = ( (j)+1 ) & 3; \
|
||||
const int j2 = ( (j)+2 ) & 3; \
|
||||
const int j3 = ( (j)+3 ) & 3; \
|
||||
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask );\
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
// blockcount always 1
|
||||
void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
|
||||
unsigned int uBlockCount )
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m512i t1, t2, s2, k1;
|
||||
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
|
||||
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
|
||||
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
|
||||
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
|
||||
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
|
||||
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
|
||||
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
|
||||
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
|
||||
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
|
||||
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
|
||||
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
|
||||
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
|
||||
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
|
||||
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
|
||||
|
||||
for ( b = 0; b < uBlockCount; b++ )
|
||||
{
|
||||
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
for( j = ctx->uHashSize / 256; j < 4; j++ )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ j ] = _mm512_load_si512(
|
||||
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE( _statebackup, _state );
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for ( r = 0; r < ctx->uRounds / 2; r++ )
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
if ( ctx->uHashSize == 256 )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 2 ] ) ;
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ] [0 ],
|
||||
_statebackup[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
}
|
||||
|
||||
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = m512_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen )
|
||||
{
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
|
||||
__m512i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_4way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m512_zero;
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( state->buffer, data, vlen );
|
||||
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
|
||||
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] =
|
||||
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
|
||||
state->buffer[ vblen-1 ] =
|
||||
_mm512_set4_epi64( 0, state->processed_bits,
|
||||
0, state->processed_bits );
|
||||
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
|
||||
echo_4way_compress( state, state->buffer, 1 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
36
algo/echo/echo-hash-4way.h
Normal file
36
algo/echo/echo-hash-4way.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#if !defined(ECHO_HASH_4WAY_H__)
|
||||
#define ECHO_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i state[4][4];
|
||||
__m512i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
|
||||
__m512i k;
|
||||
__m512i hashsize;
|
||||
__m512i const1536;
|
||||
|
||||
unsigned int uRounds;
|
||||
unsigned int uHashSize;
|
||||
unsigned int uBlockLength;
|
||||
unsigned int uBufferBytes;
|
||||
unsigned int processed_bits;
|
||||
|
||||
} echo_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
int echo_4way_init( echo_4way_context *state, int hashbitlen );
|
||||
|
||||
|
||||
int echo_4way_update( echo_4way_context *state, const void *data,
|
||||
unsigned int databitlen);
|
||||
|
||||
int echo_close( echo_4way_context *state, void *hashval );
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -11,6 +11,8 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#define SPH_FUGUE_NOCOPY 1
|
||||
|
||||
static const sph_u32 IV224[] = {
|
||||
SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
|
||||
SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
|
||||
|
@@ -43,7 +43,7 @@
|
||||
# if !defined( __MINGW32__ ) && !defined( _AIX )
|
||||
# include <endian.h>
|
||||
# if !defined( __BEOS__ )
|
||||
# include <byteswap.h>
|
||||
//# include <byteswap.h>
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
@@ -73,7 +73,7 @@ __m128i ALL_FF;
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
@@ -195,7 +195,7 @@ __m128i ALL_FF;
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
|
||||
@@ -209,7 +209,6 @@ __m128i ALL_FF;
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
|
||||
@@ -218,7 +217,6 @@ __m128i ALL_FF;
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@
|
||||
|
||||
//#ifndef NO_AES_NI
|
||||
|
||||
// Not to be confused with AVX512VAES
|
||||
#define VAES
|
||||
// #define VAVX
|
||||
// #define VVPERM
|
||||
|
@@ -12,7 +12,7 @@
|
||||
#include <memory.h>
|
||||
#include "hash-groestl.h"
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
@@ -230,6 +230,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF1024( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
|
@@ -9,7 +9,7 @@
|
||||
#include <memory.h>
|
||||
#include "hash-groestl256.h"
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
|
64
algo/groestl/groestl-4way.c
Normal file
64
algo/groestl/groestl-4way.c
Normal file
@@ -0,0 +1,64 @@
|
||||
#include "groestl-gate.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(GROESTL_4WAY_VAES)
|
||||
|
||||
#include "groestl512-hash-4way.h"
|
||||
|
||||
void groestl_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (128)));
|
||||
groestl512_4way_context ctx;
|
||||
|
||||
groestl512_4way_init( &ctx, 64 );
|
||||
groestl512_4way_update_close( &ctx, hash, input, 640 );
|
||||
|
||||
groestl512_4way_init( &ctx, 64 );
|
||||
groestl512_4way_update_close( &ctx, hash, hash, 512 );
|
||||
|
||||
dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
mm512_bswap32_intrlv80_4x128( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+ 4, n+1 );
|
||||
be32enc( noncep+ 8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
|
||||
groestl_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash+(lane<<3) )[7] < Htarg )
|
||||
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
23
algo/groestl/groestl-gate.c
Normal file
23
algo/groestl/groestl-gate.c
Normal file
@@ -0,0 +1,23 @@
|
||||
#include "groestl-gate.h"
|
||||
|
||||
bool register_dmd_gr_algo( algo_gate_t *gate )
|
||||
{
|
||||
#if defined (GROESTL_4WAY_VAES)
|
||||
gate->scanhash = (void*)&scanhash_groestl_4way;
|
||||
gate->hash = (void*)&groestl_4way_hash;
|
||||
#else
|
||||
init_groestl_ctx();
|
||||
gate->scanhash = (void*)&scanhash_groestl;
|
||||
gate->hash = (void*)&groestlhash;
|
||||
#endif
|
||||
gate->optimizations = AES_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_groestl_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_dmd_gr_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
};
|
||||
|
31
algo/groestl/groestl-gate.h
Normal file
31
algo/groestl/groestl-gate.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#ifndef GROESTL_GATE_H__
|
||||
#define GROESTL_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define GROESTL_4WAY_VAES 1
|
||||
#endif
|
||||
|
||||
bool register_dmd_gr_algo( algo_gate_t* gate );
|
||||
|
||||
bool register_groestl_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(GROESTL_4WAY_VAES)
|
||||
|
||||
void groestl_4way_hash( void *state, const void *input );
|
||||
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void groestlhash( void *state, const void *input );
|
||||
int scanhash_groestl( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_groestl_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -1,5 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include "groestl-gate.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -56,14 +55,15 @@ void groestlhash( void *output, const void *input )
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_groestl( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
@@ -77,15 +77,12 @@ int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
groestlhash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg )
|
||||
if ( fulltest(hash, ptarget))
|
||||
{
|
||||
if ( fulltest(hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
@@ -93,26 +90,3 @@ int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
void groestl_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_dmd_gr_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_groestl_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_groestl;
|
||||
gate->hash = (void*)&groestlhash;
|
||||
gate->set_target = (void*)&groestl_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_groestl_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_dmd_gr_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
280
algo/groestl/groestl256-hash-4way.c
Normal file
280
algo/groestl/groestl256-hash-4way.c
Normal file
@@ -0,0 +1,280 @@
|
||||
/* hash.c Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <memory.h>
|
||||
#include "hash-groestl256.h"
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl256-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl256-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
// Use this only for midstate and never for cryptonight
|
||||
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
const int blocks = len / SIZE256; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
|
||||
ctx->blk_count = blocks;
|
||||
ctx->databitlen = databitlen;
|
||||
|
||||
// digest any full blocks
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
// adjust buf_ptr to last block
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// Copy any remainder to buffer
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
// adjust rem_ptr for new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
// don't use this at all
|
||||
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
// first pad byte = 0x80, last pad byte = block count
|
||||
// everything in between is zero
|
||||
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// remainder is only 8 bytes, ie u64.
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
// must be cryptonight, copy 64 bits of data
|
||||
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
|
||||
i = -1; // signal for odd length
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy any remaining data to buffer for final transform
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
}
|
||||
|
||||
//--- final ---
|
||||
|
||||
// adjust for final block
|
||||
blocks++;
|
||||
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( i == -1 )
|
||||
{
|
||||
// cryptonight odd length
|
||||
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
|
||||
// finish the block with zero and length padding as normal
|
||||
i = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
hashState_groestl256 context;
|
||||
|
||||
/* initialise */
|
||||
if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* process message */
|
||||
if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* finalise */
|
||||
ret = final_groestl256(&context, hashval);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* eBash API */
|
||||
//#ifdef crypto_hash_BYTES
|
||||
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
|
||||
//{
|
||||
// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
|
||||
// return -1;
|
||||
//}
|
||||
//#endif
|
||||
|
||||
#endif
|
121
algo/groestl/groestl256-hash-4way.h
Normal file
121
algo/groestl/groestl256-hash-4way.h
Normal file
@@ -0,0 +1,121 @@
|
||||
/* hash.h Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#ifndef __hash_h
|
||||
#define __hash_h
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdio.h>
|
||||
#if defined(_WIN64) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
/* eBash API begin */
|
||||
/*
|
||||
#include "crypto_hash.h"
|
||||
#ifdef crypto_hash_BYTES
|
||||
|
||||
#include <crypto_uint8.h>
|
||||
#include <crypto_uint32.h>
|
||||
#include <crypto_uint64.h>
|
||||
typedef crypto_uint8 u8;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint64 u64;
|
||||
#endif
|
||||
*/
|
||||
/* eBash API end */
|
||||
|
||||
//#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
#include IACA_MARKS
|
||||
#endif
|
||||
|
||||
#define LENGTH (256)
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
#define COLS512 (8)
|
||||
//#define COLS1024 (16)
|
||||
#define SIZE_512 ((ROWS)*(COLS512))
|
||||
//#define SIZE1024 ((ROWS)*(COLS1024))
|
||||
#define ROUNDS512 (10)
|
||||
//#define ROUNDS1024 (14)
|
||||
|
||||
//#if LENGTH<=256
|
||||
#define COLS (COLS512)
|
||||
//#define SIZE (SIZE512)
|
||||
#define ROUNDS (ROUNDS512)
|
||||
//#else
|
||||
//#define COLS (COLS1024)
|
||||
//#define SIZE (SIZE1024)
|
||||
//#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
#define U64BIG(a) (a)
|
||||
#endif /* IS_BIG_ENDIAN */
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
|
||||
#define U64BIG(a) \
|
||||
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
|
||||
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
|
||||
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
|
||||
(ROTL64(a,56) & li_64(FF000000FF000000)))
|
||||
#endif /* IS_LITTLE_ENDIAN */
|
||||
|
||||
typedef unsigned char BitSequence_gr;
|
||||
typedef unsigned long long DataLength_gr;
|
||||
typedef enum
|
||||
{
|
||||
SUCCESS_GR = 0,
|
||||
FAIL_GR = 1,
|
||||
BAD_HASHBITLEN_GR = 2
|
||||
} HashReturn_gr;
|
||||
|
||||
#define SIZE256 (SIZE_512/16)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
|
||||
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
|
||||
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
|
||||
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
|
||||
// u64 block_counter; /* message block counter */
|
||||
int hashlen; // bytes
|
||||
int blk_count;
|
||||
int buf_ptr; /* data buffer pointer */
|
||||
int rem_ptr;
|
||||
int databitlen;
|
||||
} hashState_groestl256;
|
||||
|
||||
HashReturn_gr init_groestl256( hashState_groestl256*, int );
|
||||
|
||||
HashReturn_gr reinit_groestl256( hashState_groestl256* );
|
||||
|
||||
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
|
||||
DataLength_gr );
|
||||
|
||||
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
|
||||
|
||||
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
BitSequence_gr* );
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
|
||||
#endif /* __hash_h */
|
492
algo/groestl/groestl256-intr-4way.h
Normal file
492
algo/groestl/groestl256-intr-4way.h
Normal file
@@ -0,0 +1,492 @@
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
//__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
//__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0); \
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256( __m128i* chaining )
|
||||
{
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512( __m128i* chaining, __m128i* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512( __m128i* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
|
114
algo/groestl/groestl512-hash-4way.c
Normal file
114
algo/groestl/groestl512-hash-4way.c
Normal file
@@ -0,0 +1,114 @@
|
||||
/* hash.c Aug 2011
|
||||
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
// Optimized for hash and data length that are integrals of __m128i
|
||||
|
||||
|
||||
#include <memory.h>
|
||||
#include "groestl512-intr-4way.h"
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
#define ROTL64(a,n) \
|
||||
( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )
|
||||
|
||||
#define U64BIG(a) \
|
||||
( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
|
||||
( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
|
||||
( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
|
||||
( ROTL64(a,56) & 0xFF000000FF000000 ) )
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_zero;
|
||||
}
|
||||
|
||||
uint64_t len = U64BIG((uint64_t)LENGTH);
|
||||
ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
|
||||
INIT_4way(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF1024_4way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
|
94
algo/groestl/groestl512-hash-4way.h
Normal file
94
algo/groestl/groestl512-hash-4way.h
Normal file
@@ -0,0 +1,94 @@
|
||||
/* hash.h Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#if !defined(GROESTL512_HASH_4WAY_H__)
|
||||
#define GROESTL512_HASH_4WAY_H__ 1
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#if defined(_WIN64) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
//#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
//#include "algo/sha/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
//#define COLS512 (8)
|
||||
#define COLS1024 (16)
|
||||
//#define SIZE512 ((ROWS)*(COLS512))
|
||||
#define SIZE_1024 ((ROWS)*(COLS1024))
|
||||
//#define ROUNDS512 (10)
|
||||
#define ROUNDS1024 (14)
|
||||
|
||||
//#if LENGTH<=256
|
||||
//#define COLS (COLS512)
|
||||
//#define SIZE (SIZE512)
|
||||
//#define ROUNDS (ROUNDS512)
|
||||
//#else
|
||||
#define COLS (COLS1024)
|
||||
//#define SIZE (SIZE1024)
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
/*
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
#define U64BIG(a) (a)
|
||||
#endif // IS_BIG_ENDIAN
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
|
||||
#define U64BIG(a) \
|
||||
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
|
||||
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
|
||||
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
|
||||
(ROTL64(a,56) & li_64(FF000000FF000000)))
|
||||
#endif // IS_LITTLE_ENDIAN
|
||||
|
||||
typedef unsigned char BitSequence_gr;
|
||||
typedef unsigned long long DataLength_gr;
|
||||
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
|
||||
*/
|
||||
|
||||
#define SIZE512 (SIZE_1024/16)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
|
||||
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
|
||||
int hashlen; // byte
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
int databitlen; // bits
|
||||
} groestl512_4way_context;
|
||||
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context*, uint64_t );
|
||||
|
||||
//int reinit_groestl( hashState_groestl* );
|
||||
|
||||
int groestl512_4way_update( groestl512_4way_context*, const void*,
|
||||
uint64_t );
|
||||
|
||||
int groestl512_4way_close( groestl512_4way_context*, void* );
|
||||
|
||||
int groestl512_4way_update_close( groestl512_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif /* __hash_h */
|
654
algo/groestl/groestl512-intr-4way.h
Normal file
654
algo/groestl/groestl512-intr-4way.h
Normal file
@@ -0,0 +1,654 @@
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
|
||||
#if !defined(GROESTL512_INTR_4WAY_H__)
|
||||
#define GROESTL512_INTR_4WAY_H__ 1
|
||||
|
||||
#include "groestl512-hash-4way.h"
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
/* global constants */
|
||||
__m512i ROUND_CONST_Lx;
|
||||
//__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
//__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m512i ROUND_CONST_P[ROUNDS1024];
|
||||
__m512i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m512i TRANSP_MASK;
|
||||
__m512i SUBSH_MASK[8];
|
||||
__m512i ALL_1B;
|
||||
__m512i ALL_FF;
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
// calculate the round constants seperately and load at startup
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
|
||||
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
|
||||
TRANSP_MASK = _mm512_set_epi32( \
|
||||
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
|
||||
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
|
||||
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
|
||||
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
|
||||
SUBSH_MASK[0] = _mm512_set_epi32( \
|
||||
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
|
||||
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
|
||||
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
|
||||
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
|
||||
SUBSH_MASK[1] = _mm512_set_epi32( \
|
||||
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
|
||||
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
|
||||
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
|
||||
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
|
||||
SUBSH_MASK[2] = _mm512_set_epi32( \
|
||||
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
|
||||
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
|
||||
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
|
||||
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
|
||||
SUBSH_MASK[3] = _mm512_set_epi32( \
|
||||
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
|
||||
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
|
||||
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
|
||||
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
|
||||
SUBSH_MASK[4] = _mm512_set_epi32( \
|
||||
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
|
||||
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
|
||||
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
|
||||
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
|
||||
SUBSH_MASK[5] = _mm512_set_epi32( \
|
||||
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
|
||||
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
|
||||
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
|
||||
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
|
||||
SUBSH_MASK[6] = _mm512_set_epi32( \
|
||||
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
|
||||
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
|
||||
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
|
||||
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
|
||||
SUBSH_MASK[7] = _mm512_set_epi32( \
|
||||
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
|
||||
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
|
||||
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
|
||||
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
|
||||
for( i = 0; i < ROUNDS1024; i++ ) \
|
||||
{ \
|
||||
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
|
||||
0xb0a09080 ^ (i * 0x01010101), \
|
||||
0x70605040 ^ (i * 0x01010101), \
|
||||
0x30201000 ^ (i * 0x01010101) ); \
|
||||
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
|
||||
0x4f5f6f7f ^ (i * 0x01010101), \
|
||||
0x8f9fafbf ^ (i * 0x01010101), \
|
||||
0xcfdfefff ^ (i * 0x01010101));\
|
||||
} \
|
||||
}while(0);\
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm512_xor_si512( b0, b0 );\
|
||||
a0 = _mm512_aesenclast_epi128( a0, b0 );\
|
||||
a1 = _mm512_aesenclast_epi128( a1, b0 );\
|
||||
a2 = _mm512_aesenclast_epi128( a2, b0 );\
|
||||
a3 = _mm512_aesenclast_epi128( a3, b0 );\
|
||||
a4 = _mm512_aesenclast_epi128( a4, b0 );\
|
||||
a5 = _mm512_aesenclast_epi128( a5, b0 );\
|
||||
a6 = _mm512_aesenclast_epi128( a6, b0 );\
|
||||
a7 = _mm512_aesenclast_epi128( a7, b0 );\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
#define ROUNDS_P(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
#define ROUNDS_Q(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
|
||||
{ \
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
|
||||
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
|
||||
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
|
||||
xmm11 = _mm512_xor_si512( xmm11, xmm1 );\
|
||||
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
|
||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
|
||||
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
|
||||
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
|
||||
xmm3 = _mm512_xor_si512( xmm3, xmm9 );\
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
/* Matrix Transpose
|
||||
* input is a 1024-bit state with two columns in one xmm
|
||||
* output is a 1024-bit state with two rows in one xmm
|
||||
* inputs: i0-i7
|
||||
* outputs: i0-i7
|
||||
* clobbers: t0-t7
|
||||
*/
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i6 = _mm512_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm512_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm512_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm512_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm512_shuffle_epi8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm512_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm512_shuffle_epi8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm512_shuffle_epi8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm512_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm512_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm512_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm512_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm512_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm512_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm512_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm512_unpacklo_epi16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm512_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm512_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm512_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm512_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm512_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm512_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm512_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm512_shuffle_epi32(i6, 216);\
|
||||
\
|
||||
/* continue with unpack */\
|
||||
t4 = i0;\
|
||||
i0 = _mm512_unpacklo_epi32(i0, i2);\
|
||||
t4 = _mm512_unpackhi_epi32(t4, i2);\
|
||||
t5 = t0;\
|
||||
t0 = _mm512_unpacklo_epi32(t0, t1);\
|
||||
t5 = _mm512_unpackhi_epi32(t5, t1);\
|
||||
t6 = i4;\
|
||||
i4 = _mm512_unpacklo_epi32(i4, i6);\
|
||||
t7 = t2;\
|
||||
t6 = _mm512_unpackhi_epi32(t6, i6);\
|
||||
i2 = t0;\
|
||||
t2 = _mm512_unpacklo_epi32(t2, t3);\
|
||||
i3 = t0;\
|
||||
t7 = _mm512_unpackhi_epi32(t7, t3);\
|
||||
\
|
||||
/* there are now 2 rows in each xmm */\
|
||||
/* unpack to get 1 row of CV in each xmm */\
|
||||
i1 = i0;\
|
||||
i1 = _mm512_unpackhi_epi64(i1, i4);\
|
||||
i0 = _mm512_unpacklo_epi64(i0, i4);\
|
||||
i4 = t4;\
|
||||
i3 = _mm512_unpackhi_epi64(i3, t2);\
|
||||
i5 = t4;\
|
||||
i2 = _mm512_unpacklo_epi64(i2, t2);\
|
||||
i6 = t5;\
|
||||
i5 = _mm512_unpackhi_epi64(i5, t6);\
|
||||
i7 = t5;\
|
||||
i4 = _mm512_unpacklo_epi64(i4, t6);\
|
||||
i7 = _mm512_unpackhi_epi64(i7, t7);\
|
||||
i6 = _mm512_unpacklo_epi64(i6, t7);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse
|
||||
* input is a 1024-bit state with two rows in one xmm
|
||||
* output is a 1024-bit state with two columns in one xmm
|
||||
* inputs: i0-i7
|
||||
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
|
||||
* clobbers: t0-t4
|
||||
*/
|
||||
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
|
||||
/* transpose matrix to get output format */\
|
||||
o1 = i0;\
|
||||
i0 = _mm512_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm512_unpackhi_epi64(o1, i1);\
|
||||
t0 = i2;\
|
||||
i2 = _mm512_unpacklo_epi64(i2, i3);\
|
||||
t0 = _mm512_unpackhi_epi64(t0, i3);\
|
||||
t1 = i4;\
|
||||
i4 = _mm512_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm512_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o0 = TRANSP_MASK;\
|
||||
i6 = _mm512_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm512_unpackhi_epi64(t2, i7);\
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
i0 = _mm512_shuffle_epi8(i0, o0);\
|
||||
i2 = _mm512_shuffle_epi8(i2, o0);\
|
||||
i4 = _mm512_shuffle_epi8(i4, o0);\
|
||||
i6 = _mm512_shuffle_epi8(i6, o0);\
|
||||
o1 = _mm512_shuffle_epi8(o1, o0);\
|
||||
t0 = _mm512_shuffle_epi8(t0, o0);\
|
||||
t1 = _mm512_shuffle_epi8(t1, o0);\
|
||||
t2 = _mm512_shuffle_epi8(t2, o0);\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t3 = i4;\
|
||||
o2 = o1;\
|
||||
o0 = i0;\
|
||||
t4 = t1;\
|
||||
\
|
||||
t3 = _mm512_unpackhi_epi16(t3, i6);\
|
||||
i4 = _mm512_unpacklo_epi16(i4, i6);\
|
||||
o0 = _mm512_unpackhi_epi16(o0, i2);\
|
||||
i0 = _mm512_unpacklo_epi16(i0, i2);\
|
||||
o2 = _mm512_unpackhi_epi16(o2, t0);\
|
||||
o1 = _mm512_unpacklo_epi16(o1, t0);\
|
||||
t4 = _mm512_unpackhi_epi16(t4, t2);\
|
||||
t1 = _mm512_unpacklo_epi16(t1, t2);\
|
||||
/* shuffle with immediate */\
|
||||
i4 = _mm512_shuffle_epi32(i4, 216);\
|
||||
t3 = _mm512_shuffle_epi32(t3, 216);\
|
||||
o1 = _mm512_shuffle_epi32(o1, 216);\
|
||||
o2 = _mm512_shuffle_epi32(o2, 216);\
|
||||
i0 = _mm512_shuffle_epi32(i0, 216);\
|
||||
o0 = _mm512_shuffle_epi32(o0, 216);\
|
||||
t1 = _mm512_shuffle_epi32(t1, 216);\
|
||||
t4 = _mm512_shuffle_epi32(t4, 216);\
|
||||
/* continue with unpack */\
|
||||
i1 = i0;\
|
||||
i3 = o0;\
|
||||
i5 = o1;\
|
||||
i7 = o2;\
|
||||
i0 = _mm512_unpacklo_epi32(i0, i4);\
|
||||
i1 = _mm512_unpackhi_epi32(i1, i4);\
|
||||
o0 = _mm512_unpacklo_epi32(o0, t3);\
|
||||
i3 = _mm512_unpackhi_epi32(i3, t3);\
|
||||
o1 = _mm512_unpacklo_epi32(o1, t1);\
|
||||
i5 = _mm512_unpackhi_epi32(i5, t1);\
|
||||
o2 = _mm512_unpacklo_epi32(o2, t4);\
|
||||
i7 = _mm512_unpackhi_epi32(i7, t4);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
}
|
||||
|
||||
void TF1024_4way( __m512i* chaining, const __m512i* message )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i QTEMP[8];
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
xmm9 = message[1];
|
||||
xmm10 = message[2];
|
||||
xmm11 = message[3];
|
||||
xmm12 = message[4];
|
||||
xmm13 = message[5];
|
||||
xmm14 = message[6];
|
||||
xmm15 = message[7];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store message M (Q input) for later */
|
||||
QTEMP[0] = xmm8;
|
||||
QTEMP[1] = xmm9;
|
||||
QTEMP[2] = xmm10;
|
||||
QTEMP[3] = xmm11;
|
||||
QTEMP[4] = xmm12;
|
||||
QTEMP[5] = xmm13;
|
||||
QTEMP[6] = xmm14;
|
||||
QTEMP[7] = xmm15;
|
||||
|
||||
/* xor CV to message to get P input */
|
||||
/* result: CV+M in xmm8...xmm15 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV+M) in xmm8...xmm15 */
|
||||
ROUNDS_P();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV+M)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
|
||||
|
||||
/* store P(CV+M)+CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
/* load message M (Q input) into xmm8-15 */
|
||||
xmm8 = QTEMP[0];
|
||||
xmm9 = QTEMP[1];
|
||||
xmm10 = QTEMP[2];
|
||||
xmm11 = QTEMP[3];
|
||||
xmm12 = QTEMP[4];
|
||||
xmm13 = QTEMP[5];
|
||||
xmm14 = QTEMP[6];
|
||||
xmm15 = QTEMP[7];
|
||||
|
||||
/* compute permutation Q */
|
||||
/* result: Q(M) in xmm8...xmm15 */
|
||||
ROUNDS_Q();
|
||||
|
||||
/* xor Q output */
|
||||
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF1024_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV) in xmm8...xmm15 */
|
||||
ROUNDS_P();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
|
||||
|
||||
/* transpose CV back from row ordering to column ordering */
|
||||
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
|
||||
Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[4] = xmm0;
|
||||
chaining[5] = xmm6;
|
||||
chaining[6] = xmm13;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_INTR_4WAY_H__
|
@@ -10,7 +10,7 @@
|
||||
#else
|
||||
#include "aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
@@ -18,7 +18,7 @@ typedef struct {
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
#endif
|
||||
sph_sha256_context sha;
|
||||
SHA256_CTX sha;
|
||||
} myrgr_ctx_holder;
|
||||
|
||||
myrgr_ctx_holder myrgr_ctx;
|
||||
@@ -28,15 +28,15 @@ void init_myrgr_ctx()
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &myrgr_ctx.groestl );
|
||||
#else
|
||||
init_groestl (&myrgr_ctx.groestl, 64 );
|
||||
init_groestl ( &myrgr_ctx.groestl, 64 );
|
||||
#endif
|
||||
sph_sha256_init(&myrgr_ctx.sha);
|
||||
SHA256_Init( &myrgr_ctx.sha );
|
||||
}
|
||||
|
||||
void myriad_hash(void *output, const void *input)
|
||||
{
|
||||
myrgr_ctx_holder ctx;
|
||||
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
|
||||
myrgr_ctx_holder ctx;
|
||||
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
|
||||
|
||||
uint32_t _ALIGN(32) hash[16];
|
||||
|
||||
@@ -44,25 +44,25 @@ void myriad_hash(void *output, const void *input)
|
||||
sph_groestl512(&ctx.groestl, input, 80);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#else
|
||||
update_groestl( &ctx.groestl, (char*)input, 640 );
|
||||
final_groestl( &ctx.groestl, (char*)hash);
|
||||
update_groestl( &ctx.groestl, (char*)input, 640 );
|
||||
final_groestl( &ctx.groestl, (char*)hash);
|
||||
#endif
|
||||
|
||||
sph_sha256(&ctx.sha, hash, 64);
|
||||
sph_sha256_close(&ctx.sha, hash);
|
||||
SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
|
||||
SHA256_Final( (unsigned char*)hash, &ctx.sha );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_myriad(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
@@ -88,15 +88,3 @@ int scanhash_myriad(int thr_id, struct work *work,
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
bool register_myriad_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
init_myrgr_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad;
|
||||
gate->hash = (void*)&myriadhash;
|
||||
// gate->hash_alt = (void*)&myriadhash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
@@ -1,14 +1,159 @@
|
||||
#include "myrgr-gate.h"
|
||||
|
||||
#if defined(MYRGR_4WAY)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "aes_ni/hash-groestl.h"
|
||||
#include "algo/sha/sha2-hash-4way.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#if defined(__VAES__)
|
||||
#include "groestl512-hash-4way.h"
|
||||
#endif
|
||||
|
||||
#if defined(MYRGR_8WAY)
|
||||
|
||||
typedef struct {
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
#endif
|
||||
sha256_8way_context sha;
|
||||
} myrgr_8way_ctx_holder;
|
||||
|
||||
myrgr_8way_ctx_holder myrgr_8way_ctx;
|
||||
|
||||
void init_myrgr_8way_ctx()
|
||||
{
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 );
|
||||
#else
|
||||
init_groestl( &myrgr_8way_ctx.groestl, 64 );
|
||||
#endif
|
||||
sha256_8way_init( &myrgr_8way_ctx.sha );
|
||||
}
|
||||
|
||||
void myriad_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vhashA[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[20*8] __attribute__ ((aligned (64)));
|
||||
myrgr_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
||||
|
||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[20] __attribute__ ((aligned (64)));
|
||||
|
||||
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
#else
|
||||
|
||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash4[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash5[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[20] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[20] __attribute__ ((aligned (64)));
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
||||
sha256_8way_close( &ctx.sha, output );
|
||||
}
|
||||
|
||||
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm512_bswap32_intrlv80_4x128( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+ 8, n+1 );
|
||||
be32enc( noncep+16, n+2 );
|
||||
be32enc( noncep+24, n+3 );
|
||||
be32enc( noncep+32, n+4 );
|
||||
be32enc( noncep+40, n+5 );
|
||||
be32enc( noncep+48, n+6 );
|
||||
be32enc( noncep+64, n+7 );
|
||||
|
||||
myriad_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(MYRGR_4WAY)
|
||||
|
||||
typedef struct {
|
||||
hashState_groestl groestl;
|
||||
@@ -33,7 +178,7 @@ void myriad_4way_hash( void *output, const void *input )
|
||||
myrgr_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
|
||||
|
||||
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -43,66 +188,52 @@ void myriad_4way_hash( void *output, const void *input )
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
|
||||
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
sha256_4way( &ctx.sha, vhash, 64 );
|
||||
sha256_4way_close( &ctx.sha, vhash );
|
||||
|
||||
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
|
||||
vhash, 256 );
|
||||
sha256_4way_update( &ctx.sha, vhash, 64 );
|
||||
sha256_4way_close( &ctx.sha, output );
|
||||
}
|
||||
|
||||
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
/*
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
*/
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+1, n+1 );
|
||||
be32enc( noncep+2, n+2 );
|
||||
be32enc( noncep+3, n+3 );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
|
||||
|
||||
myriad_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
nonces[ num_found++ ] = n+i;
|
||||
work_set_target_ratio( work, hash+(i<<3) );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -2,17 +2,22 @@
|
||||
|
||||
bool register_myriad_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (MYRGR_4WAY)
|
||||
#if defined (MYRGR_8WAY)
|
||||
init_myrgr_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad_8way;
|
||||
gate->hash = (void*)&myriad_8way_hash;
|
||||
gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT;
|
||||
#elif defined (MYRGR_4WAY)
|
||||
init_myrgr_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad_4way;
|
||||
gate->hash = (void*)&myriad_4way_hash;
|
||||
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT;
|
||||
#else
|
||||
init_myrgr_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad;
|
||||
gate->hash = (void*)&myriad_hash;
|
||||
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
|
||||
#endif
|
||||
gate->optimizations = AES_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,30 +1,35 @@
|
||||
#ifndef MYRGR_GATE_H__
|
||||
#define MYRGR_GATE_H__
|
||||
#define MYRGR_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define MYRGR_4WAY
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define MYRGR_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
|
||||
#define MYRGR_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(MYRGR_4WAY)
|
||||
#if defined(MYRGR_8WAY)
|
||||
|
||||
void myriad_8way_hash( void *state, const void *input );
|
||||
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_myrgr_8way_ctx();
|
||||
|
||||
#elif defined(MYRGR_4WAY)
|
||||
|
||||
void myriad_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_myrgr_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void myriad_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_myrgr_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -32,8 +32,6 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
//#include "miner.h"
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -100,7 +98,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
//#include "hamsi-helper-4way.c"
|
||||
|
||||
/*
|
||||
static const sph_u32 IV512[] = {
|
||||
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
|
||||
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
|
||||
@@ -109,7 +107,7 @@ static const sph_u32 IV512[] = {
|
||||
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
|
||||
SPH_C32(0x6769756d)
|
||||
};
|
||||
|
||||
*/
|
||||
static const sph_u32 alpha_n[] = {
|
||||
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
|
||||
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
|
||||
@@ -138,6 +136,7 @@ static const sph_u32 alpha_f[] = {
|
||||
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
|
||||
};
|
||||
|
||||
|
||||
// imported from hamsi helper
|
||||
|
||||
/* Note: this table lists bits within each byte from least
|
||||
@@ -529,48 +528,374 @@ static const sph_u32 T512[64][16] = {
|
||||
SPH_C32(0xe7e00a94) }
|
||||
};
|
||||
|
||||
#define s0 m0
|
||||
#define s1 c0
|
||||
#define s2 m1
|
||||
#define s3 c1
|
||||
#define s4 c2
|
||||
#define s5 m2
|
||||
#define s6 c3
|
||||
#define s7 m3
|
||||
#define s8 m4
|
||||
#define s9 c4
|
||||
#define sA m5
|
||||
#define sB c5
|
||||
#define sC c6
|
||||
#define sD m6
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi 8 way
|
||||
|
||||
#define INPUT_BIG8 \
|
||||
do { \
|
||||
__m512i db = *buf; \
|
||||
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
|
||||
dm = mm512_negate_32( _mm512_or_si512( dm, \
|
||||
_mm512_slli_epi64( dm, 32 ) ) ); \
|
||||
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[1] ) ) ); \
|
||||
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[2] ) ) ); \
|
||||
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[3] ) ) ); \
|
||||
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[4] ) ) ); \
|
||||
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[5] ) ) ); \
|
||||
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[6] ) ) ); \
|
||||
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
db = _mm512_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SBOX8( a, b, c, d ) \
|
||||
do { \
|
||||
__m512i t; \
|
||||
t = a; \
|
||||
a = _mm512_and_si512( a, c ); \
|
||||
a = _mm512_xor_si512( a, d ); \
|
||||
c = _mm512_xor_si512( c, b ); \
|
||||
c = _mm512_xor_si512( c, a ); \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, b ); \
|
||||
t = _mm512_xor_si512( t, c ); \
|
||||
b = d; \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, a ); \
|
||||
a = _mm512_and_si512( a, b ); \
|
||||
t = _mm512_xor_si512( t, a ); \
|
||||
b = _mm512_xor_si512( b, d ); \
|
||||
b = _mm512_xor_si512( b, t ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
b = d; \
|
||||
d = mm512_not( t ); \
|
||||
} while (0)
|
||||
|
||||
#define L8( a, b, c, d ) \
|
||||
do { \
|
||||
a = mm512_rol_32( a, 13 ); \
|
||||
c = mm512_rol_32( c, 3 ); \
|
||||
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
|
||||
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
|
||||
_mm512_slli_epi32( a, 3 ) ) ); \
|
||||
b = mm512_rol_32( b, 1 ); \
|
||||
d = mm512_rol_32( d, 7 ); \
|
||||
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
|
||||
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
|
||||
_mm512_slli_epi32( b, 7 ) ) ); \
|
||||
a = mm512_rol_32( a, 5 ); \
|
||||
c = mm512_rol_32( c, 22 ); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE_BIG8 \
|
||||
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
|
||||
|
||||
#define READ_STATE_BIG8(sc) \
|
||||
do { \
|
||||
c0 = sc->h[0x0]; \
|
||||
c1 = sc->h[0x1]; \
|
||||
c2 = sc->h[0x2]; \
|
||||
c3 = sc->h[0x3]; \
|
||||
c4 = sc->h[0x4]; \
|
||||
c5 = sc->h[0x5]; \
|
||||
c6 = sc->h[0x6]; \
|
||||
c7 = sc->h[0x7]; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG8(sc) \
|
||||
do { \
|
||||
sc->h[0x0] = c0; \
|
||||
sc->h[0x1] = c1; \
|
||||
sc->h[0x2] = c2; \
|
||||
sc->h[0x3] = c3; \
|
||||
sc->h[0x4] = c4; \
|
||||
sc->h[0x5] = c5; \
|
||||
sc->h[0x6] = c6; \
|
||||
sc->h[0x7] = c7; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define ROUND_BIG8(rc, alpha) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
s0 = _mm512_xor_si512( s0, m512_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
\
|
||||
SBOX8( s0, s4, s8, sC ); \
|
||||
SBOX8( s1, s5, s9, sD ); \
|
||||
SBOX8( s2, s6, sA, sE ); \
|
||||
SBOX8( s3, s7, sB, sF ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
|
||||
_mm512_bslli_epi128( s5, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
|
||||
_mm512_bslli_epi128( sE, 4 ) ); \
|
||||
L8( s0, t1, s9, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( s6, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
|
||||
_mm512_bslli_epi128( sF, 4 ) ); \
|
||||
L8( s1, t1, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
|
||||
_mm512_bslli_epi128( s7, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
|
||||
_mm512_bslli_epi128( sC, 4 ) ); \
|
||||
L8( s2, t1, sB, t3 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
|
||||
_mm512_bslli_epi128( s4, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
L8( s3, t1, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
|
||||
_mm512_bslli_epi128( sB, 4 ) ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
|
||||
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
||||
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
|
||||
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
|
||||
} while (0)
|
||||
|
||||
#define P_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8(0, alpha_n); \
|
||||
ROUND_BIG8(1, alpha_n); \
|
||||
ROUND_BIG8(2, alpha_n); \
|
||||
ROUND_BIG8(3, alpha_n); \
|
||||
ROUND_BIG8(4, alpha_n); \
|
||||
ROUND_BIG8(5, alpha_n); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8( 0, alpha_f); \
|
||||
ROUND_BIG8( 1, alpha_f); \
|
||||
ROUND_BIG8( 2, alpha_f); \
|
||||
ROUND_BIG8( 3, alpha_f); \
|
||||
ROUND_BIG8( 4, alpha_f); \
|
||||
ROUND_BIG8( 5, alpha_f); \
|
||||
ROUND_BIG8( 6, alpha_f); \
|
||||
ROUND_BIG8( 7, alpha_f); \
|
||||
ROUND_BIG8( 8, alpha_f); \
|
||||
ROUND_BIG8( 9, alpha_f); \
|
||||
ROUND_BIG8(10, alpha_f); \
|
||||
ROUND_BIG8(11, alpha_f); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG8 \
|
||||
do { /* order is important */ \
|
||||
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
|
||||
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
|
||||
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
|
||||
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
|
||||
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
|
||||
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
|
||||
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
|
||||
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
|
||||
} while (0)
|
||||
|
||||
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
|
||||
{
|
||||
DECL_STATE_BIG8
|
||||
uint32_t tmp = num << 6;
|
||||
|
||||
sc->count_low = SPH_T32( sc->count_low + tmp );
|
||||
sc->count_high += (sph_u32)( (num >> 13) >> 13 );
|
||||
if ( sc->count_low < tmp )
|
||||
sc->count_high++;
|
||||
|
||||
READ_STATE_BIG8( sc );
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
|
||||
INPUT_BIG8;
|
||||
P_BIG8;
|
||||
T_BIG8;
|
||||
buf++;
|
||||
}
|
||||
WRITE_STATE_BIG8( sc );
|
||||
}
|
||||
|
||||
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
||||
{
|
||||
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
DECL_STATE_BIG8
|
||||
READ_STATE_BIG8( sc );
|
||||
INPUT_BIG8;
|
||||
PF_BIG8;
|
||||
T_BIG8;
|
||||
WRITE_STATE_BIG8( sc );
|
||||
}
|
||||
|
||||
|
||||
void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
|
||||
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
|
||||
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
|
||||
sc->h[2] = m512_const1_64( 0x302c206272672031 );
|
||||
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
|
||||
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
|
||||
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
|
||||
}
|
||||
|
||||
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
|
||||
hamsi_8way_big( sc, vdata, len>>3 );
|
||||
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||
len &= (size_t)7;
|
||||
memcpy_512( sc->buf, vdata, len>>3 );
|
||||
sc->partial_len = len;
|
||||
}
|
||||
|
||||
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
{
|
||||
__m512i pad[1];
|
||||
int ch, cl;
|
||||
|
||||
sph_enc32be( &ch, sc->count_high );
|
||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
|
||||
cl, ch, cl, ch, cl, ch, cl, ch );
|
||||
// pad[0] = m512_const2_32( cl, ch );
|
||||
sc->buf[0] = m512_const1_64( 0x80 );
|
||||
hamsi_8way_big( sc, sc->buf, 1 );
|
||||
hamsi_8way_big_final( sc, pad );
|
||||
|
||||
mm512_block_bswap_32( (__m512i*)dst, sc->h );
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
// Hamsi 4 way
|
||||
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
__m256i db = *buf; \
|
||||
const sph_u32 *tp = &T512[0][0]; \
|
||||
m0 = m256_zero; \
|
||||
m1 = m256_zero; \
|
||||
m2 = m256_zero; \
|
||||
m3 = m256_zero; \
|
||||
m4 = m256_zero; \
|
||||
m5 = m256_zero; \
|
||||
m6 = m256_zero; \
|
||||
m7 = m256_zero; \
|
||||
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
|
||||
dm = mm256_negate_32( _mm256_or_si256( dm, \
|
||||
_mm256_slli_epi64( dm, 32 ) ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
|
||||
tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
|
||||
m256_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
|
||||
tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
|
||||
m256_const1_64( tp[1] ) ) ); \
|
||||
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
|
||||
tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
|
||||
m256_const1_64( tp[2] ) ) ); \
|
||||
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
|
||||
tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
|
||||
m256_const1_64( tp[3] ) ) ); \
|
||||
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
|
||||
tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
|
||||
m256_const1_64( tp[4] ) ) ); \
|
||||
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
|
||||
tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
|
||||
m256_const1_64( tp[5] ) ) ); \
|
||||
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
|
||||
tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
|
||||
m256_const1_64( tp[6] ) ) ); \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
|
||||
tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
|
||||
tp += 0x10; \
|
||||
m256_const1_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
db = _mm256_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -642,6 +967,7 @@ do { \
|
||||
sc->h[0x7] = c7; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define s0 m0
|
||||
#define s1 c0
|
||||
#define s2 m1
|
||||
@@ -658,58 +984,28 @@ do { \
|
||||
#define sD m6
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
*/
|
||||
|
||||
#define ROUND_BIG(rc, alpha) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \
|
||||
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
|
||||
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
|
||||
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
|
||||
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
|
||||
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
|
||||
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
|
||||
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
|
||||
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
|
||||
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
|
||||
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
|
||||
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
|
||||
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
|
||||
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
|
||||
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
|
||||
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
|
||||
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
|
||||
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
|
||||
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
|
||||
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
|
||||
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
|
||||
sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
|
||||
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
|
||||
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
|
||||
sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
|
||||
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
|
||||
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
|
||||
sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
|
||||
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
|
||||
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
|
||||
sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
|
||||
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
|
||||
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
|
||||
sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
|
||||
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
|
||||
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
|
||||
sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
|
||||
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
|
||||
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
\
|
||||
SBOX( s0, s4, s8, sC ); \
|
||||
SBOX( s1, s5, s9, sD ); \
|
||||
@@ -863,47 +1159,23 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
|
||||
void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sph_u32 lo, hi;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
lo = 2*i;
|
||||
hi = 2*i + 1;
|
||||
sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo],
|
||||
IV512[hi], IV512[lo], IV512[hi], IV512[lo] );
|
||||
}
|
||||
|
||||
sc->h[0] = m256_const1_64( 0x6c70617273746565 );
|
||||
sc->h[1] = m256_const1_64( 0x656e62656b204172 );
|
||||
sc->h[2] = m256_const1_64( 0x302c206272672031 );
|
||||
sc->h[3] = m256_const1_64( 0x3434362c75732032 );
|
||||
sc->h[4] = m256_const1_64( 0x3030312020422d33 );
|
||||
sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
|
||||
}
|
||||
|
||||
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
|
||||
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
|
||||
// It looks like the only way to get in here is if core was previously called
|
||||
// with a very small len
|
||||
// That's not likely even with 80 byte input so deprecate partial len
|
||||
/*
|
||||
if ( sc->partial_len != 0 )
|
||||
{
|
||||
size_t mlen;
|
||||
|
||||
mlen = 8 - sc->partial_len;
|
||||
if ( len < mlen )
|
||||
{
|
||||
memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
|
||||
sc->partial_len += len;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
|
||||
len -= mlen;
|
||||
vdata += mlen>>3;
|
||||
hamsi_big( sc, sc->partial, 1 );
|
||||
sc->partial_len = 0;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
hamsi_big( sc, vdata, len>>3 );
|
||||
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||
len &= (size_t)7;
|
||||
@@ -913,20 +1185,19 @@ void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
|
||||
|
||||
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
{
|
||||
__m256i *out = (__m256i*)dst;
|
||||
__m256i pad[1];
|
||||
size_t u;
|
||||
int ch, cl;
|
||||
|
||||
sph_enc32be( &ch, sc->count_high );
|
||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
|
||||
sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
||||
0UL, 0x80UL, 0UL, 0x80UL );
|
||||
sc->buf[0] = m256_const1_64( 0x80 );
|
||||
// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
||||
// 0UL, 0x80UL, 0UL, 0x80UL );
|
||||
hamsi_big( sc, sc->buf, 1 );
|
||||
hamsi_big_final( sc, pad );
|
||||
for ( u = 0; u < 8; u ++ )
|
||||
out[u] = mm256_bswap_32( sc->h[u] );
|
||||
|
||||
mm256_block_bswap_32( (__m256i*)dst, sc->h );
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@@ -40,7 +40,7 @@
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -60,9 +60,32 @@ typedef struct {
|
||||
typedef hamsi_4way_big_context hamsi512_4way_context;
|
||||
|
||||
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
||||
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
|
||||
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
//#define hamsi512_4way hamsi512_4way_update
|
||||
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i h[8];
|
||||
__m512i buf[1];
|
||||
size_t partial_len;
|
||||
sph_u32 count_high, count_low;
|
||||
} hamsi_8way_big_context;
|
||||
|
||||
typedef hamsi_8way_big_context hamsi512_8way_context;
|
||||
|
||||
void hamsi512_8way_init( hamsi512_8way_context *sc );
|
||||
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -38,7 +38,7 @@
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
|
||||
( haval_4way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
|
115
algo/haval/haval-8way-helper.c
Normal file
115
algo/haval/haval-8way-helper.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
|
||||
/*
|
||||
* Helper code, included (three times !) by HAVAL implementation.
|
||||
*
|
||||
* TODO: try to merge this with md_helper.c.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
|
||||
( haval_8way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
unsigned current;
|
||||
|
||||
current = (unsigned)sc->count_low & 127U;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
|
||||
vdata += clen>>2;
|
||||
current += clen;
|
||||
len -= clen;
|
||||
if ( current == 128U )
|
||||
{
|
||||
DSTATE_8W;
|
||||
IN_PREPARE_8W(sc->buf);
|
||||
RSTATE_8W;
|
||||
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
|
||||
WSTATE_8W;
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
void *dst)
|
||||
{
|
||||
unsigned current;
|
||||
DSTATE_8W;
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m256_one_32;
|
||||
current += 4;
|
||||
RSTATE_8W;
|
||||
if ( current > 116UL )
|
||||
{
|
||||
memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_8W(sc->buf);
|
||||
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
|
||||
} while (0);
|
||||
current = 0;
|
||||
}
|
||||
|
||||
uint32_t t1, t2;
|
||||
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_8W(sc->buf);
|
||||
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
|
||||
} while (0);
|
||||
WSTATE_8W;
|
||||
haval_8way_out( sc, dst );
|
||||
}
|
@@ -40,7 +40,7 @@
|
||||
#include <string.h>
|
||||
#include "haval-hash-4way.h"
|
||||
|
||||
// won't compile with sse4.2
|
||||
// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
|
||||
//#if defined (__SSE4_2__)
|
||||
#if defined(__AVX__)
|
||||
|
||||
@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
|
||||
haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
|
||||
{ \
|
||||
haval ## y ## _4way(cc, data, len); \
|
||||
haval ## y ## _4way_update(cc, data, len); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
@@ -518,6 +518,301 @@ do { \
|
||||
|
||||
#define INMSG(i) msg[i]
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Haval-256 8 way 32 bit avx2
|
||||
|
||||
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( x0, \
|
||||
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) ) ) \
|
||||
|
||||
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x2, \
|
||||
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ) ), \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
|
||||
|
||||
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), x0 ) )
|
||||
|
||||
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
|
||||
_mm256_and_si256( x4, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
|
||||
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
|
||||
|
||||
|
||||
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x0, \
|
||||
mm256_not( _mm256_xor_si256( \
|
||||
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), \
|
||||
_mm256_and_si256( x3, x6 ) ) )
|
||||
|
||||
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_8W(x1, x0, x3, x5, x6, x2, x4)
|
||||
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_8W(x4, x2, x1, x0, x5, x3, x6)
|
||||
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_8W(x6, x1, x2, x3, x4, x5, x0)
|
||||
|
||||
#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_8W(x2, x6, x1, x4, x5, x3, x0)
|
||||
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_8W(x3, x5, x2, x0, x1, x6, x4)
|
||||
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_8W(x1, x4, x3, x6, x0, x2, x5)
|
||||
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_8W(x6, x4, x0, x5, x2, x1, x3)
|
||||
|
||||
#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_8W(x3, x4, x1, x0, x5, x2, x6)
|
||||
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_8W(x6, x2, x1, x0, x3, x4, x5)
|
||||
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_8W(x2, x6, x0, x4, x3, x1, x5)
|
||||
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_8W(x1, x5, x3, x2, x0, x4, x6)
|
||||
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F5_8W(x2, x5, x0, x6, x4, x3, x1)
|
||||
|
||||
#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
|
||||
do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define PASS1_8W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6), SPH_C32(0x00000000)); \
|
||||
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7), SPH_C32(0x00000000)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASSG_8W(p, n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(MP ## p[pass_count + 0]), \
|
||||
RK ## p[pass_count + 0]); \
|
||||
STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(MP ## p[pass_count + 1]), \
|
||||
RK ## p[pass_count + 1]); \
|
||||
STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(MP ## p[pass_count + 2]), \
|
||||
RK ## p[pass_count + 2]); \
|
||||
STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(MP ## p[pass_count + 3]), \
|
||||
RK ## p[pass_count + 3]); \
|
||||
STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(MP ## p[pass_count + 4]), \
|
||||
RK ## p[pass_count + 4]); \
|
||||
STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(MP ## p[pass_count + 5]), \
|
||||
RK ## p[pass_count + 5]); \
|
||||
STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(MP ## p[pass_count + 6]), \
|
||||
RK ## p[pass_count + 6]); \
|
||||
STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(MP ## p[pass_count + 7]), \
|
||||
RK ## p[pass_count + 7]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASS2_8W(n, in) PASSG_8W(2, n, in)
|
||||
#define PASS3_8W(n, in) PASSG_8W(3, n, in)
|
||||
#define PASS4_8W(n, in) PASSG_8W(4, n, in)
|
||||
#define PASS5_8W(n, in) PASSG_8W(5, n, in)
|
||||
|
||||
#define SAVE_STATE_8W \
|
||||
__m256i u0, u1, u2, u3, u4, u5, u6, u7; \
|
||||
do { \
|
||||
u0 = s0; \
|
||||
u1 = s1; \
|
||||
u2 = s2; \
|
||||
u3 = s3; \
|
||||
u4 = s4; \
|
||||
u5 = s5; \
|
||||
u6 = s6; \
|
||||
u7 = s7; \
|
||||
} while (0)
|
||||
|
||||
#define UPDATE_STATE_8W \
|
||||
do { \
|
||||
s0 = _mm256_add_epi32( s0, u0 ); \
|
||||
s1 = _mm256_add_epi32( s1, u1 ); \
|
||||
s2 = _mm256_add_epi32( s2, u2 ); \
|
||||
s3 = _mm256_add_epi32( s3, u3 ); \
|
||||
s4 = _mm256_add_epi32( s4, u4 ); \
|
||||
s5 = _mm256_add_epi32( s5, u5 ); \
|
||||
s6 = _mm256_add_epi32( s6, u6 ); \
|
||||
s7 = _mm256_add_epi32( s7, u7 ); \
|
||||
} while (0)
|
||||
|
||||
#define CORE_8W5(in) do { \
|
||||
SAVE_STATE_8W; \
|
||||
PASS1_8W(5, in); \
|
||||
PASS2_8W(5, in); \
|
||||
PASS3_8W(5, in); \
|
||||
PASS4_8W(5, in); \
|
||||
PASS5_8W(5, in); \
|
||||
UPDATE_STATE_8W; \
|
||||
} while (0)
|
||||
|
||||
#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7
|
||||
|
||||
#define RSTATE_8W \
|
||||
do { \
|
||||
s0 = sc->s0; \
|
||||
s1 = sc->s1; \
|
||||
s2 = sc->s2; \
|
||||
s3 = sc->s3; \
|
||||
s4 = sc->s4; \
|
||||
s5 = sc->s5; \
|
||||
s6 = sc->s6; \
|
||||
s7 = sc->s7; \
|
||||
} while (0)
|
||||
|
||||
#define WSTATE_8W \
|
||||
do { \
|
||||
sc->s0 = s0; \
|
||||
sc->s1 = s1; \
|
||||
sc->s2 = s2; \
|
||||
sc->s3 = s3; \
|
||||
sc->s4 = s4; \
|
||||
sc->s5 = s5; \
|
||||
sc->s6 = s6; \
|
||||
sc->s7 = s7; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = m256_const1_32( 0x243F6A88UL );
|
||||
sc->s1 = m256_const1_32( 0x85A308D3UL );
|
||||
sc->s2 = m256_const1_32( 0x13198A2EUL );
|
||||
sc->s3 = m256_const1_32( 0x03707344UL );
|
||||
sc->s4 = m256_const1_32( 0xA4093822UL );
|
||||
sc->s5 = m256_const1_32( 0x299F31D0UL );
|
||||
sc->s6 = m256_const1_32( 0x082EFA98UL );
|
||||
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
sc->count_low = 0;
|
||||
|
||||
}
|
||||
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
|
||||
|
||||
#define INW_8W(i) load_ptr_8w[ i ]
|
||||
|
||||
static void
|
||||
haval_8way_out( haval_8way_context *sc, void *dst )
|
||||
{
|
||||
__m256i *buf = (__m256i*)dst;
|
||||
DSTATE_8W;
|
||||
RSTATE_8W;
|
||||
|
||||
buf[0] = s0;
|
||||
buf[1] = s1;
|
||||
buf[2] = s2;
|
||||
buf[3] = s3;
|
||||
buf[4] = s4;
|
||||
buf[5] = s5;
|
||||
buf[6] = s6;
|
||||
buf[7] = s7;
|
||||
}
|
||||
|
||||
#undef PASSES
|
||||
#define PASSES 5
|
||||
#include "haval-8way-helper.c"
|
||||
|
||||
#define API_8W(xxx, y) \
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
|
||||
{ \
|
||||
haval_8way_init(cc, xxx >> 5, y); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
|
||||
{ \
|
||||
haval ## y ## _8way_update(cc, data, len); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
|
||||
{ \
|
||||
haval ## y ## _8way_close(cc, dst); \
|
||||
} \
|
||||
|
||||
API_8W(256, 5)
|
||||
|
||||
#define RVAL_8W \
|
||||
do { \
|
||||
s0 = val[0]; \
|
||||
s1 = val[1]; \
|
||||
s2 = val[2]; \
|
||||
s3 = val[3]; \
|
||||
s4 = val[4]; \
|
||||
s5 = val[5]; \
|
||||
s6 = val[6]; \
|
||||
s7 = val[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WVAL_8W \
|
||||
do { \
|
||||
val[0] = s0; \
|
||||
val[1] = s1; \
|
||||
val[2] = s2; \
|
||||
val[3] = s3; \
|
||||
val[4] = s4; \
|
||||
val[5] = s5; \
|
||||
val[6] = s6; \
|
||||
val[7] = s7; \
|
||||
} while (0)
|
||||
|
||||
#define INMSG_8W(i) msg[i]
|
||||
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -59,7 +59,7 @@
|
||||
*/
|
||||
|
||||
#ifndef HAVAL_HASH_4WAY_H__
|
||||
#define HAVAL_HASH_4WAY_H__
|
||||
#define HAVAL_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
@@ -69,7 +69,7 @@ extern "C"{
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_haval256_5 256
|
||||
|
||||
@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;
|
||||
|
||||
void haval256_5_4way_init( void *cc );
|
||||
|
||||
void haval256_5_4way( void *cc, const void *data, size_t len );
|
||||
void haval256_5_4way_update( void *cc, const void *data, size_t len );
|
||||
//#define haval256_5_4way haval256_5_4way_update
|
||||
|
||||
void haval256_5_4way_close( void *cc, void *dst );
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[32];
|
||||
__m256i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_8way_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef haval_8way_context haval256_5_8way_context;
|
||||
|
||||
void haval256_5_8way_init( void *cc );
|
||||
|
||||
void haval256_5_8way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void haval256_5_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -131,12 +131,14 @@ void bastionhash(void *output, const void *input)
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_bastion(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_bastion( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
@@ -79,11 +79,12 @@ extern void heavyhash(unsigned char* output, const unsigned char* input, int len
|
||||
|
||||
}
|
||||
|
||||
int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[8];
|
||||
uint32_t start_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
do {
|
||||
heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include "wolf-aes.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
|
||||
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
|
||||
{
|
||||
@@ -83,7 +83,7 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
|
||||
keys[14] = tmp1;
|
||||
}
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#if defined(__SSE4_2__)
|
||||
//#ifdef __AVX__
|
||||
|
||||
#define AESENC(i,j) \
|
||||
|
@@ -15,11 +15,6 @@ pthread_barrier_t hodl_barrier;
|
||||
// need to be passed.
|
||||
unsigned char *hodl_scratchbuf = NULL;
|
||||
|
||||
void hodl_set_target( struct work* work, double diff )
|
||||
{
|
||||
diff_to_target(work->target, diff / 8388608.0 );
|
||||
}
|
||||
|
||||
void hodl_le_build_stratum_request( char* req, struct work* work,
|
||||
struct stratum_ctx *sctx )
|
||||
{
|
||||
@@ -101,39 +96,6 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
|
||||
g_work->data[31] = 0x00000280;
|
||||
}
|
||||
|
||||
// hodl build_extra_header is redundant, hodl can use std_build_extra_header
|
||||
// and call hodl_build_block_header.
|
||||
#if 0
|
||||
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
|
||||
{
|
||||
uchar merkle_tree[64] = { 0 };
|
||||
size_t t;
|
||||
// int i;
|
||||
|
||||
algo_gate.gen_merkle_root( merkle_tree, sctx );
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
|
||||
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
|
||||
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
|
||||
le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
|
||||
/*
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
|
||||
|
||||
g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
|
||||
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
|
||||
g_work->data[22] = 0x80000000;
|
||||
g_work->data[31] = 0x00000280;
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
||||
// called only by thread 0, saves a backup of g_work
|
||||
void hodl_get_new_work( struct work* work, struct work* g_work)
|
||||
{
|
||||
@@ -176,20 +138,20 @@ bool hodl_do_this_thread( int thr_id )
|
||||
return ( thr_id == 0 );
|
||||
}
|
||||
|
||||
int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int hodl_scanhash( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
#ifndef NO_AES_NI
|
||||
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
|
||||
#if defined(__AES__)
|
||||
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
|
||||
pthread_barrier_wait( &hodl_barrier );
|
||||
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
|
||||
return scanhash_hodl_wolf( work, max_nonce, hashes_done, thr_info );
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
bool register_hodl_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
#if !defined(__AES__)
|
||||
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
|
||||
return false;
|
||||
#endif
|
||||
@@ -199,20 +161,19 @@ bool register_hodl_algo( algo_gate_t* gate )
|
||||
// return false;
|
||||
// }
|
||||
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
|
||||
gate->optimizations = AES_OPT | SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&hodl_scanhash;
|
||||
gate->get_new_work = (void*)&hodl_get_new_work;
|
||||
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
|
||||
gate->set_target = (void*)&hodl_set_target;
|
||||
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
|
||||
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
|
||||
gate->build_block_header = (void*)&hodl_build_block_header;
|
||||
// gate->build_extraheader = (void*)&hodl_build_extraheader;
|
||||
gate->resync_threads = (void*)&hodl_resync_threads;
|
||||
gate->do_this_thread = (void*)&hodl_do_this_thread;
|
||||
gate->work_cmp_size = 76;
|
||||
hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
|
||||
allow_getwork = false;
|
||||
opt_target_factor = 8388608.0;
|
||||
return ( hodl_scratchbuf != NULL );
|
||||
}
|
||||
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include "hodl-wolf.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
|
||||
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
|
||||
void *MidHash )
|
||||
@@ -17,7 +17,7 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
|
||||
const uint32_t StartChunk = ThreadID * Chunk;
|
||||
const uint32_t EndChunk = StartChunk + Chunk;
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#if defined(__SSE4_2__)
|
||||
//#ifdef __AVX__
|
||||
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
|
||||
uint64_t* desination[ SHA512_PARALLEL_N ];
|
||||
@@ -61,13 +61,14 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
#ifdef __SSE4_2__
|
||||
#if defined(__SSE4_2__)
|
||||
//#ifdef __AVX__
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int threadNumber = mythr->id;
|
||||
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
|
||||
CacheEntry Cache[AES_PARALLEL_N];
|
||||
__m128i* data[AES_PARALLEL_N];
|
||||
@@ -147,6 +148,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
|
||||
CacheEntry Cache;
|
||||
uint32_t CollisionCount = 0;
|
||||
int threadNumber = mythr->id;
|
||||
|
||||
swab32_array( BlockHdr, pdata, 20 );
|
||||
// Search for pattern in psuedorandom data
|
||||
@@ -160,7 +162,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
{
|
||||
// copy data to first l2 cache
|
||||
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
|
||||
#ifndef NO_AES_NI
|
||||
for(int j = 0; j < AES_ITERATIONS; j++)
|
||||
{
|
||||
CacheEntry TmpXOR;
|
||||
@@ -184,7 +185,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
|
||||
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
|
||||
- 1 ], 256 ); }
|
||||
#endif
|
||||
// use last X bits as solution
|
||||
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
|
||||
& (COMPARE_SIZE - 1) ) < 1000 )
|
||||
@@ -206,7 +206,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
|
||||
*hashes_done = CollisionCount;
|
||||
return(0);
|
||||
|
||||
#endif
|
||||
#endif // AVX else
|
||||
|
||||
}
|
||||
|
||||
@@ -218,5 +218,5 @@ void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
|
||||
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // AES
|
||||
|
||||
|
@@ -19,8 +19,8 @@ typedef union _CacheEntry
|
||||
__m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
|
||||
} CacheEntry;
|
||||
|
||||
int scanhash_hodl_wolf( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user