Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Compare commits (13 commits):

1a234cbe53
47cc5dcff5
2cd1507c2e
9b905fccc8
92b3733925
19cc88d102
a053690170
3c5e8921b7
f3333b0070
902ec046dd
d0b4941321
40089428c5
dc6b007a18
@@ -32,14 +32,26 @@ but different package names.
 $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
-support depending on your CPU and compiler version:
+openssl 1.1.0e or higher.

-"-march=native" is always the best choice
+znver1 and znver2 should be recognized on most recent versions of GCC and
+znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
+In the meantime here are some suggestions to compile with new CPUs:

-"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+"-march=native" is usually the best choice, used by build.sh.

-"-msha" Add SHA to other tuning options
+"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recognized.
+
+"-mcascadelake -msha" or
+"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
+
+Features can also be added individually:
+
+"-msha" adds support for HW accelerated sha256.
+
+"-mavx512" adds support for 512 bit vectors
+
+"-mvaes" adds support for parallel AES

 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
INSTALL_WINDOWS (143)

@@ -1,5 +1,9 @@
 Instructions for compiling cpuminer-opt for Windows.

+These instructions may be out of date. Please consult the wiki for
+the latest:
+
+https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

 Windows compilation using Visual Studio is not supported. Mingw64 is
 used on a Linux system (bare metal or virtual machine) to cross-compile
@@ -24,79 +28,76 @@ Refer to Linux compile instructions and install required packages.

 Additionally, install mingw-w64.

-sudo apt-get install mingw-w64
+sudo apt-get install mingw-w64 libz-mingw-w64-dev


 2. Create a local library directory for packages to be compiled in the next
 step. Suggested location is $HOME/usr/lib/

+$ mkdir $HOME/usr/lib

 3. Download and build other packages for mingw that don't have a mingw64
 version available in the repositories.

 Download the following source code packages from their respective and
 respected download locations, copy them to ~/usr/lib/ and uncompress them.

-openssl
-curl
-gmp
-
-In most cases the latest vesrion is ok but it's safest to download
-the same major and minor version as included in your distribution.
-
-Run the following commands or follow the supplied instructions.
-Do not run "make install" unless you are using ~/usr/lib, which isn't
-recommended.
-
-Some instructions insist on running "make check". If make check fails
-it may still work, YMMV.
-
-You can speed up "make" by using all CPU cores available with "-j n" where
-n is the number of CPU threads you want to use.
+openssl: https://github.com/openssl/openssl/releases
+
+curl: https://github.com/curl/curl/releases
+
+gmp: https://gmplib.org/download/gmp/
+
+
+In most cases the latest version is ok but it's safest to download the same major and minor version as included in your distribution. The following uses versions from Ubuntu 20.04. Change version numbers as required.
+
+Run the following commands or follow the supplied instructions. Do not run "make install" unless you are using /usr/lib, which isn't recommended.
+
+Some instructions insist on running "make check". If make check fails it may still work, YMMV.
+
+You can speed up "make" by using all CPU cores available with "-j n" where n is the number of CPU threads you want to use.

 openssl:

-./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32
-make
+$ ./Configure mingw64 shared --cross-compile-prefix=x86_64-w64-mingw32-
+$ make
+
+Make may fail with an ld error, just ensure libcrypto-1_1-x64.dll is created.

 curl:

-./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32
-make
+$ ./configure --with-winssl --with-winidn --host=x86_64-w64-mingw32
+$ make

 gmp:

-./configure --host=x86_64-w64-mingw32
-make
+$ ./configure --host=x86_64-w64-mingw32
+$ make


 4. Tweak the environment.

-This step is required everytime you login or the commands can be added to
-.bashrc.
+This step is required every time you login or the commands can be added to .bashrc.

 Define some local variables to point to local library.

-export LOCAL_LIB="$HOME/usr/lib"
+$ export LOCAL_LIB="$HOME/usr/lib"

-export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
+$ export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"

-export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
+$ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"

-Create a release directory and copy some dll files previously built.
-This can be done outside of cpuminer-opt and only needs to be done once.
-If the release directory is in cpuminer-opt directory it needs to be
-recreated every a source package is decompressed.
+Adjust for gcc version:

-mkdir release
-cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
-cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
-cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
-cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
-cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
-cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
+$ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
+
+Create a release directory and copy some dll files previously built. This can be done outside of cpuminer-opt and only needs to be done once. If the release directory is in cpuminer-opt directory it needs to be recreated every time a source package is decompressed.
+
+$ mkdir release
+$ cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
+$ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
+$ cp $GCC_MINGW_LIB/libstdc++-6.dll release/
+$ cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
+$ cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
+$ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

 The following steps need to be done every time a new source package is
 opened.
@@ -110,13 +111,73 @@ https://github.com/JayDDee/cpuminer-opt/releases

 Decompress and change to the cpuminer-opt directory.

-6. Prepare to compile
+6. compile

 Create a link to the locally compiled version of gmp.h

-ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
+$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
+
+$ ./autogen.sh
+
+Configure the compiler for the CPU architecture of the host machine:
+
+CFLAGS="-O3 -march=native -Wall" ./configure $CONFIGURE_ARGS
+
+or cross compile for a specific CPU architecture:
+
+CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
+
+This will compile for AMD Ryzen.
+
+You can compile more generically for a set of specific CPU features if you know what features you want:
+
+CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
+
+This will compile for an older CPU that does not have AVX.
+
+You can find several examples in README.txt
+
+If you have a CPU with more than 64 threads and Windows 7 or higher you can enable the CPU Groups feature by adding the following to CFLAGS:
+
+"-D_WIN32_WINNT=0x0601"
+
+Once you have run configure successfully run the compiler with n CPU threads:
+
+$ make -j n
+
+Copy cpuminer.exe to the release directory, compress and copy the release directory to a Windows system and run cpuminer.exe from the command line.
+
+Run cpuminer
+
+In a command window change directories to the unzipped release folder. To get a list of all options:
+
+cpuminer.exe --help
+
+Command options are specific to where you mine. Refer to the pool's instructions on how to set them.
+
+Create a link to the locally compiled version of gmp.h
+
+$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h

 Edit configure.ac to fix libpthread package name.
Makefile.am (13)

@@ -129,7 +129,7 @@ cpuminer_SOURCES = \
  algo/lyra2/allium.c \
  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
- algo//m7m/m7m.c \
+ algo/m7m/m7m.c \
  algo/m7m/magimath.cpp \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
@@ -158,14 +158,20 @@ cpuminer_SOURCES = \
  algo/ripemd/lbry.c \
  algo/ripemd/lbry-4way.c \
  algo/scrypt/scrypt.c \
+ algo/scrypt/scrypt-core-4way.c \
  algo/scrypt/neoscrypt.c \
+ algo/sha/sha256-hash.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
  algo/sha/sha256-hash-4way.c \
  algo/sha/sha512-hash-4way.c \
+ algo/sha/sha256-hash-opt.c \
+ algo/sha/sha256-hash-2way-ni.c \
  algo/sha/hmac-sha256-hash.c \
  algo/sha/hmac-sha256-hash-4way.c \
+ algo/sha/sha256d.c \
  algo/sha/sha2.c \
+ algo/sha/sha256d-4way.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
  algo/sha/sha256t.c \
@@ -192,6 +198,11 @@ cpuminer_SOURCES = \
  algo/sm3/sm3-hash-4way.c \
  algo/swifftx/swifftx.c \
  algo/tiger/sph_tiger.c \
+ algo/verthash/verthash-gate.c \
+ algo/verthash/Verthash.c \
+ algo/verthash/fopen_utf8.c \
+ algo/verthash/tiny_sha3/sha3.c \
+ algo/verthash/tiny_sha3/sha3-4way.c \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
@@ -89,7 +89,7 @@ Supported Algorithms
 lyra2h        Hppcoin
 lyra2re       lyra2
 lyra2rev2     lyra2v2
-lyra2rev3     lyrav2v3, Vertcoin
+lyra2rev3     lyrav2v3
 lyra2z
 lyra2z330     Lyra2 330 rows, Zoin (ZOI)
 m7m           Magi (XMG)
@@ -122,6 +122,7 @@ Supported Algorithms
 tribus        Denarius (DNR)
 vanilla       blake256r8vnl (VCash)
 veltor        (VLT)
+verthash      Vertcoin
 whirlpool
 whirlpoolx
 x11           Dash
@@ -134,7 +135,7 @@ Supported Algorithms
 x14           X14
 x15           X15
 x16r
-x16rv2        Ravencoin (RVN)
+x16rv2
 x16rt         Gincoin (GIN)
 x16rt-veil    Veil (VEIL)
 x16s          Pigeoncoin (PGN)
@@ -59,11 +59,16 @@ Notes about included DLL files:

 Downloading DLL files from alternative sources presents an inherent
 security risk if their source is unknown. All DLL files included have
-been copied from the Ubuntu-20.04 instalation or compiled by me from
+been copied from the Ubuntu-20.04 installation or compiled by me from
 source code obtained from the author's official repository. The exact
 procedure is documented in the build instructions for Windows:
 https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

+Some DLL files may already be installed on the system by Windows or third
+party packages. They often will work and may be used instead of the included
+file. Without a compelling reason to do so it's recommended to use the included
+files as they are packaged.
+
 If you like this software feel free to donate:

 BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
RELEASE_NOTES (131)

@@ -65,11 +65,140 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.18.2
+
+Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
+
+AVX512 for sha256d.
+
+SSE42 and AVX may now be displayed as mining features at startup.
+This is hard coded for each algo, and is only implemented for scrypt
+at this time as it is the only algo with significant performance differences
+with those features.
+
+Fixed an issue where a high hashrate algo could cause excessive invalid hash
+rate log reports when starting up in benchmark mode.
+
+v3.18.1
+
+More speed for scrypt:
+- additional scryptn2 optimizations for all CPU architectures,
+- AVX2 is now used by default on CPUs with SHA but not AVX512,
+- scrypt:1024 performance lost in v3.18.0 is restored,
+- AVX512 & AVX2 improvements to scrypt:1024.
+
+Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
+
+Issue #337: fixed a problem that could display negative stats values in the
+first summary report if the report was forced prematurely due to a stratum
+diff change. The stats will still be invalid but should display zeros.
+
+v3.18.0
+
+Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
+- AVX512 & SHA support for sha256, AVX512 has priority,
+- up to 50% increase in hashrate,
+- memory requirements reduced 30-60% depending on CPU architecture,
+- memory usage displayed at startup,
+- scrypt, default N=1024 (LTC), will likely perform slower.
+
+Improved stale share detection and handling for Scrypt with large N factor:
+- abort and discard partially computed hash when new work is detected,
+- quicker response to new job, less time wasted mining stale job.
+
+Improved stale share handling for all algorithms:
+- report possible stale share when new work received with a previously
+  submitted share still pending,
+- when new work is detected report the submission of an already completed,
+  otherwise valid, but likely stale, share,
+- fixed incorrect block height in stale share log.
+
+Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
+
+When stratum disconnects miner threads go to idle until reconnected.
+
+Colour changes to some logs.
+
+Some low level function name changes for clarity and consistency.
+
+The reference hashrate in the summary log and the benchmark total hashrate
+are now the mean hashrate for the session.
+
+v3.17.1
+
+Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
+More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
+Fixed my-gr algo for VAES.
+
+v3.17.0
+
+AVX512 optimized using ternary logic instructions.
+Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
+Use SHA on supported CPUs to produce merkle hash.
+Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
+
+v3.16.5
+
+#329: Fixed GBT incorrect target diff in stats, second attempt.
+Fixed formatting error in share result log when --no-color option is used.
+
+v3.16.4
+
+Faster sha512 and sha256 when not using SHA CPU extension.
+#329: Fixed GBT incorrect target diff in stats.
+
+v3.16.3
+
+#313 Fix compile error with GCC 11.
+Incremental improvements to verthash.
+
+v3.16.2
+
+Verthash: midstate prehash optimization for all architectures.
+Verthash: AVX2 optimization.
+GBT: added support for Bech32 addresses.
+Linux: added CPU frequency to benchmark log.
+Fixed integer overflow in time calculations.
+
+v3.16.1
+
+New options for verthash:
+--data-file to specify the name, and optionally the path, of the verthash
+  data file, default is "verthash.dat" in the current directory.
+--verify to perform the data file integrity check at startup, default is
+  not to verify data file integrity.
+Support for creation of default verthash data file if:
+  1) --data-file option is not used,
+  2) no default data file is found in the current directory, and,
+  3) --verify option is used.
+More detailed logs related to verthash data file.
+Small verthash performance improvement.
+Fixed detection of corrupt stats caused by networking issues.
+
+v3.16.0
+
+Added verthash algo.
+
+v3.15.7
+
+Added accepted/stale/rejected percentage to summary log report.
+Added warning if share counters mismatch which could corrupt stats.
+Linux: CPU temperature reporting is more responsive to rising temperature.
+A few AVX2 & AVX512 tweaks.
+Removed some dead code and other cleanup.
+
+v3.15.6
+
+Implement keccak pre-hash optimization for x16* algos.
+Move conditional mining test to before get_new_work in miner thread.
+Add test for share reject reason when solo mining.
+Add support for floating point, as well as integer, "networkhasps" in
+RPC getmininginfo method.
+
 v3.15.5

 Fix stratum jobs lost if 2 jobs received in less than one second.


 v3.15.4

 Fixed yescryptr16 broken in v3.15.3.
algo-gate-api.c (192)

@@ -15,8 +15,6 @@
 #include <stdbool.h>
 #include <memory.h>
 #include <unistd.h>
-#include <openssl/sha.h>
-//#include "miner.h"
 #include "algo-gate-api.h"

 // Define null and standard functions.
@@ -279,9 +277,11 @@ void init_algo_gate( algo_gate_t* gate )
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wimplicit-function-declaration"

-// called by each thread that uses the gate
+// Called once by main
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
+  bool rc = false;
+
   if ( NULL == gate )
   {
     applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
@@ -290,108 +290,108 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   init_algo_gate( gate );

-  switch (algo)
+  switch ( algo )
   {
-    case ALGO_ALLIUM: register_allium_algo ( gate ); break;
+    case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
-    case ALGO_ANIME: register_anime_algo ( gate ); break;
+    case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
-    case ALGO_ARGON2: register_argon2_algo ( gate ); break;
+    case ALGO_ARGON2: rc = register_argon2_algo ( gate ); break;
-    case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
+    case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
-    case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
+    case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
-    case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
+    case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
-    case ALGO_AXIOM: register_axiom_algo ( gate ); break;
+    case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
-    case ALGO_BLAKE: register_blake_algo ( gate ); break;
+    case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
-    case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
+    case ALGO_BLAKE2B: rc = register_blake2b_algo ( gate ); break;
-    case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
+    case ALGO_BLAKE2S: rc = register_blake2s_algo ( gate ); break;
-    case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
+    case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
-    case ALGO_BMW512: register_bmw512_algo ( gate ); break;
+    case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
-    case ALGO_C11: register_c11_algo ( gate ); break;
+    case ALGO_C11: rc = register_c11_algo ( gate ); break;
-    case ALGO_DECRED: register_decred_algo ( gate ); break;
+    case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
-    case ALGO_DEEP: register_deep_algo ( gate ); break;
+    case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
-    case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
+    case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
-    case ALGO_GROESTL: register_groestl_algo ( gate ); break;
+    case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
-    case ALGO_HEX: register_hex_algo ( gate ); break;
+    case ALGO_HEX: rc = register_hex_algo ( gate ); break;
-    case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
+    case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break;
-    case ALGO_HODL: register_hodl_algo ( gate ); break;
+    case ALGO_HODL: rc = register_hodl_algo ( gate ); break;
-    case ALGO_JHA: register_jha_algo ( gate ); break;
+    case ALGO_JHA: rc = register_jha_algo ( gate ); break;
-    case ALGO_KECCAK: register_keccak_algo ( gate ); break;
+    case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break;
-    case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
+    case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break;
-    case ALGO_LBRY: register_lbry_algo ( gate ); break;
+    case ALGO_LBRY: rc = register_lbry_algo ( gate ); break;
-    case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
+    case ALGO_LYRA2H: rc = register_lyra2h_algo ( gate ); break;
-    case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
+    case ALGO_LYRA2RE: rc = register_lyra2re_algo ( gate ); break;
-    case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
+    case ALGO_LYRA2REV2: rc = register_lyra2rev2_algo ( gate ); break;
-    case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break;
+    case ALGO_LYRA2REV3: rc = register_lyra2rev3_algo ( gate ); break;
-    case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
+    case ALGO_LYRA2Z: rc = register_lyra2z_algo ( gate ); break;
-    case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
+    case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
-    case ALGO_M7M: register_m7m_algo ( gate ); break;
+    case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
-    case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break;
+    case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
-    case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
+    case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
-    case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
+    case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
-    case ALGO_NIST5: register_nist5_algo ( gate ); break;
+    case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
-    case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
+    case ALGO_PENTABLAKE: rc = register_pentablake_algo ( gate ); break;
-    case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
+    case ALGO_PHI1612: rc = register_phi1612_algo ( gate ); break;
-    case ALGO_PHI2: register_phi2_algo ( gate ); break;
+    case ALGO_PHI2: rc = register_phi2_algo ( gate ); break;
-    case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
+    case ALGO_POLYTIMOS: rc = register_polytimos_algo ( gate ); break;
-    case ALGO_POWER2B: register_power2b_algo ( gate ); break;
+    case ALGO_POWER2B: rc = register_power2b_algo ( gate ); break;
-    case ALGO_QUARK: register_quark_algo ( gate ); break;
+    case ALGO_QUARK: rc = register_quark_algo ( gate ); break;
-    case ALGO_QUBIT: register_qubit_algo ( gate ); break;
+    case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
-    case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
+    case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
-    case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
+    case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
-    case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
+    case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
-    case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
+    case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
-    case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
+    case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
-    case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
+    case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
-    case ALGO_SKEIN: register_skein_algo ( gate ); break;
+    case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
-    case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
+    case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
-    case ALGO_SKUNK: register_skunk_algo ( gate ); break;
+    case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
-    case ALGO_SONOA: register_sonoa_algo ( gate ); break;
+    case ALGO_SONOA: rc = register_sonoa_algo ( gate ); break;
-    case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
+    case ALGO_TIMETRAVEL: rc = register_timetravel_algo ( gate ); break;
-    case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
+    case ALGO_TIMETRAVEL10: rc = register_timetravel10_algo ( gate ); break;
-    case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
+    case ALGO_TRIBUS: rc = register_tribus_algo ( gate ); break;
-    case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
+    case ALGO_VANILLA: rc = register_vanilla_algo ( gate ); break;
-    case ALGO_VELTOR: register_veltor_algo ( gate ); break;
+    case ALGO_VELTOR: rc = register_veltor_algo ( gate ); break;
+    case ALGO_VERTHASH: rc = register_verthash_algo ( gate ); break;
-    case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
+    case ALGO_WHIRLPOOL: rc = register_whirlpool_algo ( gate ); break;
-    case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
+    case ALGO_WHIRLPOOLX: rc = register_whirlpoolx_algo ( gate ); break;
-    case ALGO_X11: register_x11_algo ( gate ); break;
+    case ALGO_X11: rc = register_x11_algo ( gate ); break;
-    case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
+    case ALGO_X11EVO: rc = register_x11evo_algo ( gate ); break;
-    case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
+    case ALGO_X11GOST: rc = register_x11gost_algo ( gate ); break;
-    case ALGO_X12: register_x12_algo ( gate ); break;
+    case ALGO_X12: rc = register_x12_algo ( gate ); break;
-    case ALGO_X13: register_x13_algo ( gate ); break;
+    case ALGO_X13: rc = register_x13_algo ( gate ); break;
-    case ALGO_X13BCD: register_x13bcd_algo ( gate ); break;
+    case ALGO_X13BCD: rc = register_x13bcd_algo ( gate ); break;
-    case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
+    case ALGO_X13SM3: rc = register_x13sm3_algo ( gate ); break;
-    case ALGO_X14: register_x14_algo ( gate ); break;
+    case ALGO_X14: rc = register_x14_algo ( gate ); break;
-    case ALGO_X15: register_x15_algo ( gate ); break;
+    case ALGO_X15: rc = register_x15_algo ( gate ); break;
-    case ALGO_X16R: register_x16r_algo ( gate ); break;
+    case ALGO_X16R: rc = register_x16r_algo ( gate ); break;
-    case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
+    case ALGO_X16RV2: rc = register_x16rv2_algo ( gate ); break;
-    case ALGO_X16RT: register_x16rt_algo ( gate ); break;
+    case ALGO_X16RT: rc = register_x16rt_algo ( gate ); break;
-    case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
+    case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
-    case ALGO_X16S: register_x16s_algo ( gate ); break;
+    case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
-    case ALGO_X17: register_x17_algo ( gate ); break;
+    case ALGO_X17: rc = register_x17_algo ( gate ); break;
-    case ALGO_X21S: register_x21s_algo ( gate ); break;
+    case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
-    case ALGO_X22I: register_x22i_algo ( gate ); break;
+    case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
-    case ALGO_X25X: register_x25x_algo ( gate ); break;
+    case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
-    case ALGO_XEVAN: register_xevan_algo ( gate ); break;
+    case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
-    case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
+    case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break;
 // case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
-    case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
+    case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break;
 // case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
-    case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
+    case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
-    case ALGO_YESCRYPTR16: register_yescryptr16_05_algo( gate ); break;
+    case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break;
 // case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
-    case ALGO_YESCRYPTR32: register_yescryptr32_05_algo( gate ); break;
+    case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break;
 // case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
-    case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
+    case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
-    case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
+    case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
-    case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
+    case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;
-    case ALGO_ZR5: register_zr5_algo ( gate ); break;
+    case ALGO_ZR5: rc = register_zr5_algo ( gate ); break;
     default:
-       applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
+       applog(LOG_ERR,"BUG: unregistered algorithm %s.\n", algo_names[opt_algo] );
        return false;
   } // switch

-  // ensure required functions were defined.
-  if ( gate->scanhash == (void*)&null_scanhash )
+  if ( !rc )
   {
-    applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
+    applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", algo_names[opt_algo] );
     return false;
   }
   return true;
@@ -419,7 +419,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //  alias              proper
-  { "argon2d-crds",    "argon2d250"  },
   { "argon2d-dyn",     "argon2d500"  },
   { "argon2d-uis",     "argon2d4096" },
   { "bcd",             "x13bcd"      },
@@ -434,7 +433,6 @@ const char* const algo_alias_map[][2] =
   { "flax",            "c11"         },
   { "hsr",             "x13sm3"      },
   { "jackpot",         "jha"         },
-  { "jane",            "scryptjane"  },
   { "lyra2",           "lyra2re"     },
   { "lyra2v2",         "lyra2rev2"   },
   { "lyra2v3",         "lyra2rev3"   },
@@ -1,3 +1,6 @@
+#ifndef __ALGO_GATE_API_H__
+#define __ALGO_GATE_API_H__ 1
+
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -114,15 +117,15 @@ typedef struct
 // Mandatory functions, one of these is mandatory. If a generic scanhash
 // is used a custom target hash function must be registered, with a custom
 // scanhash the target hash function can be called directly and doesn't need
-// to be registered in the gate.
+// to be registered with the gate.
 int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

 int ( *hash ) ( void*, const void*, int );

 //optional, safe to use default in most cases

-// Allocate thread local buffers and other initialization specific to miner
-// threads.
+// Called once by each miner thread to allocate thread local buffers and
+// other initialization specific to miner threads.
 bool ( *miner_thread_init ) ( int );

 // Get thread local copy of blockheader with unique nonce.
@@ -150,7 +153,7 @@ void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );

 char* ( *malloc_txs_request ) ( struct work* );

-// Big or little
+// Big endian or little endian
 void ( *set_work_data_endian ) ( struct work* );

 double ( *calc_network_diff ) ( struct work* );
@@ -260,7 +263,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
 #endif

 // displays warning
-int null_hash ();
+int null_hash();

 // optional safe targets, default listed first unless noted.

@@ -281,7 +284,7 @@ void std_be_build_stratum_request( char *req, struct work *work );

 char* std_malloc_txs_request( struct work *work );

-// Default is do_nothing (assumed LE)
+// Default is do_nothing, little endian is assumed
 void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );
@@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
 // algo name if valid alias, NULL if invalid alias or algo.
 void get_algo_alias( char **algo_or_alias );

+#endif
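For readers unfamiliar with the gate mechanism touched by these hunks, here is a minimal illustrative sketch (not taken from the repository) of how an algorithm plugs into it: a register function fills in the mandatory scanhash/hash pointers described in the header and returns a result code that the reworked register_algo_gate() checks. Only the scanhash, hash and miner_thread_init members are taken from the excerpt above; the algorithm name and the function bodies are hypothetical.

// Hypothetical example, for illustration only.
#include "algo-gate-api.h"

int scanhash_myalgo( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
int myalgo_hash( void *output, const void *input, int thr_id );

bool register_myalgo_algo( algo_gate_t *gate )
{
   gate->scanhash = (void*)&scanhash_myalgo;  // mandatory: nonce search loop
   gate->hash     = (void*)&myalgo_hash;      // mandatory: hash one work unit
   return true;                               // rc is checked by register_algo_gate()
}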
@@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {

 #include <immintrin.h>

-#define ror64(x, n) _mm512_ror_epi64((x), (n))
+#define ROR64(x, n) _mm512_ror_epi64((x), (n))

 static __m512i muladd(__m512i x, __m512i y)
 {
@@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y)
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
 \
-       D0 = ror64(D0, 32); \
-       D1 = ror64(D1, 32); \
+       D0 = ROR64(D0, 32); \
+       D1 = ROR64(D1, 32); \
 \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
@@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y)
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
 \
-       B0 = ror64(B0, 24); \
-       B1 = ror64(B1, 24); \
+       B0 = ROR64(B0, 24); \
+       B1 = ROR64(B1, 24); \
    } while ((void)0, 0)

 #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y)
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
 \
-       D0 = ror64(D0, 16); \
-       D1 = ror64(D1, 16); \
+       D0 = ROR64(D0, 16); \
+       D1 = ROR64(D1, 16); \
 \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
@@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y)
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
 \
-       B0 = ror64(B0, 63); \
-       B1 = ror64(B1, 63); \
+       B0 = ROR64(B0, 63); \
+       B1 = ROR64(B1, 63); \
    } while ((void)0, 0)

 #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
 void blake512_8way_full( blake_8way_big_context *sc, void * dst,
                          const void *data, size_t len );
+void blake512_8way_hash_le80( void *hash, const void *data );

 #endif // AVX512
 #endif // AVX2
@@ -669,14 +669,14 @@ do { \
       ROUND_S_8WAY(2); \
       ROUND_S_8WAY(3); \
    } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)


@@ -808,14 +808,14 @@ do { \
       ROUND_S_16WAY(2); \
       ROUND_S_16WAY(3); \
    } \
-   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
-   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
-   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
-   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
-   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
-   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
-   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
-   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)

 #endif
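The mm256_xor3 / mm512_xor3 helpers used by the new code are not defined anywhere in this comparison. A plausible sketch of what they do, assuming they wrap the AVX-512 ternary-logic instruction mentioned in the v3.17.0 release note (0x96 is the truth-table immediate for a three-way XOR), with a two-XOR fallback where vpternlog is unavailable:

#include <immintrin.h>

// a ^ b ^ c in a single vpternlogq (truth table 0x96 = odd parity of 3 inputs).
static inline __m512i mm512_xor3( __m512i a, __m512i b, __m512i c )
{
   return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}

// 256-bit variant: vpternlogq on ymm registers needs AVX512VL,
// otherwise fall back to two ordinary XORs.
static inline __m256i mm256_xor3( __m256i a, __m256i b, __m256i c )
{
#if defined(__AVX512VL__)
   return _mm256_ternarylogic_epi64( a, b, c, 0x96 );
#else
   return _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
#endif
}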
@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
       B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
    }

-   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
-   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
-   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
-   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
-   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
-   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
-   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
-   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
+   ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
+   ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
+   ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
+   ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
+   ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
+   ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
+   ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
+   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }

 int blake2b_8way_init( blake2b_8way_ctx *ctx )
@@ -17,7 +17,7 @@

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m512i b[16]; // input buffer
    __m512i h[8];  // chained state
    uint64_t t[2]; // total number of bytes
@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
 #if defined(__AVX2__)

 // state context
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m256i b[16]; // input buffer
    __m256i h[8];  // chained state
    uint64_t t[2]; // total number of bytes
@@ -4,7 +4,6 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-//#if defined(__SSE4_2__)
 #if defined(__SSE2__)
   #define BLAKE2S_4WAY
 #endif
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

 #elif defined (BLAKE2S_8WAY)

-//#if defined(BLAKE2S_8WAY)
-
 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
@@ -368,7 +368,7 @@ do { \
    ROUND8W( 9 );

    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );

 #undef G8W
 #undef ROUND8W
@@ -566,7 +566,7 @@ do { \
    ROUND16W( 9 );

    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );

 #undef G16W
 #undef ROUND16W
@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
 } blake2s_nway_param;
 #pragma pack(pop)

-ALIGN( 64 ) typedef struct __blake2s_4way_state
+typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
    __m128i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,

 #if defined(__AVX2__)

-ALIGN( 64 ) typedef struct __blake2s_8way_state
+typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-ALIGN( 128 ) typedef struct __blake2s_16way_state
+typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
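The ALIGN macro itself does not appear in this comparison; the relocation above (from before the typedef to just after the struct keyword) is presumably about where compilers accept an alignment attribute. A hedged sketch of the idea, with a hypothetical ALIGN definition:

#include <stdint.h>

// Hypothetical ALIGN macro, assuming the usual GCC/Clang vs. MSVC spellings.
#if defined(_MSC_VER)
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__((aligned(x)))
#endif

// Attaching the attribute to the struct type itself (after the "struct"
// keyword) aligns every instance of the typedef'd type; placing it before
// "typedef" is ignored or rejected by some compilers.
typedef struct ALIGN( 64 ) example_state
{
   uint64_t h[8];
} example_state_t;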
@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
    H5 = (state)->H[5]; \
    H6 = (state)->H[6]; \
    H7 = (state)->H[7]; \
-   S0 = (state)->S[0]; \
-   S1 = (state)->S[1]; \
-   S2 = (state)->S[2]; \
-   S3 = (state)->S[3]; \
    T0 = (state)->T0; \
    T1 = (state)->T1; \
 } while (0)
@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
    (state)->H[5] = H5; \
    (state)->H[6] = H6; \
    (state)->H[7] = H7; \
-   (state)->S[0] = S0; \
-   (state)->S[1] = S1; \
-   (state)->S[2] = S2; \
-   (state)->S[3] = S3; \
    (state)->T0 = T0; \
    (state)->T1 = T1; \
 } while (0)
@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {

 #define DECL_STATE64_8WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m512i S0, S1, S2, S3; \
    uint64_t T0, T1;

 #define COMPRESS64_8WAY( buf ) do \
@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
-   V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
-   VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
-   VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
+   V8 = m512_const1_64( CB0 ); \
+   V9 = m512_const1_64( CB1 ); \
+   VA = m512_const1_64( CB2 ); \
+   VB = m512_const1_64( CB3 ); \
    VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
                           m512_const1_64( CB4 ) ); \
    VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
    ROUND_B_8WAY(3); \
    ROUND_B_8WAY(4); \
    ROUND_B_8WAY(5); \
-   H0 = mm512_xor4( V8, V0, S0, H0 ); \
-   H1 = mm512_xor4( V9, V1, S1, H1 ); \
-   H2 = mm512_xor4( VA, V2, S2, H2 ); \
-   H3 = mm512_xor4( VB, V3, S3, H3 ); \
-   H4 = mm512_xor4( VC, V4, S0, H4 ); \
-   H5 = mm512_xor4( VD, V5, S1, H5 ); \
-   H6 = mm512_xor4( VE, V6, S2, H6 ); \
-   H7 = mm512_xor4( VF, V7, S3, H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)

 void blake512_8way_compress( blake_8way_big_context *sc )
@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
-   V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
-   VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
-   VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
+   V8 = m512_const1_64( CB0 );
+   V9 = m512_const1_64( CB1 );
+   VA = m512_const1_64( CB2 );
+   VB = m512_const1_64( CB3 );
    VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
                           m512_const1_64( CB4 ) );
    VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    ROUND_B_8WAY(4);
    ROUND_B_8WAY(5);

-   sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
 }

 void blake512_8way_init( blake_8way_big_context *sc )
 {
-   __m512i zero = m512_zero;
    casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
    casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
    casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
 
-   casti_m512i( sc->S, 0 ) = zero;
-   casti_m512i( sc->S, 1 ) = zero;
-   casti_m512i( sc->S, 2 ) = zero;
-   casti_m512i( sc->S, 3 ) = zero;
-
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }
@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
 
-   casti_m512i( sc->S, 0 ) = m512_zero;
-   casti_m512i( sc->S, 1 ) = m512_zero;
-   casti_m512i( sc->S, 2 ) = m512_zero;
-   casti_m512i( sc->S, 3 ) = m512_zero;
-
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 
@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
 
 #define DECL_STATE64_4WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m256i S0, S1, S2, S3; \
    uint64_t T0, T1;
 
 #define COMPRESS64_4WAY do \
@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
+   V8 = m256_const1_64( CB0 ); \
+   V9 = m256_const1_64( CB1 ); \
+   VA = m256_const1_64( CB2 ); \
+   VB = m256_const1_64( CB3 ); \
    VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
                           m256_const1_64( CB4 ) ); \
    VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
    ROUND_B_4WAY(3); \
    ROUND_B_4WAY(4); \
    ROUND_B_4WAY(5); \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)
 
 
@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
-   V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
-   VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
-   VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
+   V8 = m256_const1_64( CB0 );
+   V9 = m256_const1_64( CB1 );
+   VA = m256_const1_64( CB2 );
+   VB = m256_const1_64( CB3 );
    VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
                           m256_const1_64( CB4 ) );
    VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    ROUND_B_4WAY(4);
    ROUND_B_4WAY(5);
 
-   sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }
 
 void blake512_4way_init( blake_4way_big_context *sc )
 {
-   __m256i zero = m256_zero;
    casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
    casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
    casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
 
-   casti_m256i( sc->S, 0 ) = zero;
-   casti_m256i( sc->S, 1 ) = zero;
-   casti_m256i( sc->S, 2 ) = zero;
-   casti_m256i( sc->S, 3 ) = zero;
-
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }
@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
 
-   casti_m256i( sc->S, 0 ) = m256_zero;
-   casti_m256i( sc->S, 1 ) = m256_zero;
-   casti_m256i( sc->S, 2 ) = m256_zero;
-   casti_m256i( sc->S, 3 ) = m256_zero;
-
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 
@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
 
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
-  blake2s_state S[1];
+  blake2s_state S;
 
   /* Verify parameters */
   if ( NULL == in ) return -1;
@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
 
   if( keylen > 0 )
   {
-    if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+    if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
   }
   else
   {
-    if( blake2s_init( S, outlen ) < 0 ) return -1;
+    if( blake2s_init( &S, outlen ) < 0 ) return -1;
   }
 
-  blake2s_update( S, ( uint8_t * )in, inlen );
-  blake2s_final( S, out, outlen );
+  blake2s_update( &S, ( uint8_t * )in, inlen );
+  blake2s_final( &S, out, outlen );
   return 0;
 }
 
@@ -116,7 +116,7 @@ extern "C" {
   uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
 } blake2s_param;
 
-ALIGN( 64 ) typedef struct __blake2s_state
+typedef struct ALIGN( 64 ) __blake2s_state
 {
   uint32_t h[8];
   uint32_t t[2];
@@ -18,7 +18,7 @@
 #endif
 
 // state context
-ALIGN(64) typedef struct {
+typedef ALIGN(64) struct {
    uint8_t b[128]; // input buffer
    uint64_t h[8];  // chained state
    uint64_t t[2];  // total number of bytes
@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
    qt[30] = expand2s8( qt, M, H, 30 );
    qt[31] = expand2s8( qt, M, H, 31 );
 
-   xl = _mm256_xor_si256(
-                  mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                  mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm256_xor_si256( xl, _mm256_xor_si256(
-                  mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                  mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
+                    mm256_xor3( qt[19], qt[20], qt[21] ),
+                    _mm256_xor_si256( qt[22], qt[23] ) );
+   xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
+                    mm256_xor3( qt[26], qt[27], qt[28] ),
+                    mm256_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-       _mm256_xor_si256( M[m], \
-          _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
-                            _mm256_srli_epi32( qt[a], sr ) ) ), \
-       _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
+                                 _mm256_srli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-       _mm256_xor_si256( M[m], \
-          _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
-                            _mm256_slli_epi32( qt[a], sr ) ) ), \
-       _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
+                                 _mm256_slli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
 #undef DH2L
 #undef DH2R
 
-/*
-   dH[ 0] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[0],
-                      _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
-                                        _mm256_srli_epi32( qt[16], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[1],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
-                                        _mm256_slli_epi32( qt[17], 8 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[2],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
-                                        _mm256_slli_epi32( qt[18], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[3],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
-                                        _mm256_slli_epi32( qt[19], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[4],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
-                                        _mm256_slli_epi32( qt[20], 0 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[5],
-                      _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
-                                        _mm256_srli_epi32( qt[21], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[6],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
-                                        _mm256_slli_epi32( qt[22], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[7],
-                      _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
-                                        _mm256_slli_epi32( qt[23], 2 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-*/
 }
 
 static const __m256i final_s8[16] =
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
    qt[30] = expand2s16( qt, M, H, 30 );
    qt[31] = expand2s16( qt, M, H, 31 );
 
-   xl = _mm512_xor_si512(
-                  mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                  mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
-                  mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                  mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
+                    mm512_xor3( qt[19], qt[20], qt[21] ),
+                    _mm512_xor_si512( qt[22], qt[23] ) );
+   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
+                    mm512_xor3( qt[26], qt[27], qt[28] ),
+                    mm512_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-       _mm512_xor_si512( M[m], \
-          _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
-                            _mm512_srli_epi32( qt[a], sr ) ) ), \
-       _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
+                                 _mm512_srli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-       _mm512_xor_si512( M[m], \
-          _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
-                            _mm512_slli_epi32( qt[a], sr ) ) ), \
-       _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
+                                 _mm512_slli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
@@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
|||||||
#define rb6(x) mm256_rol_64( x, 43 )
|
#define rb6(x) mm256_rol_64( x, 43 )
|
||||||
#define rb7(x) mm256_rol_64( x, 53 )
|
#define rb7(x) mm256_rol_64( x, 53 )
|
||||||
|
|
||||||
#define rol_off_64( M, j, off ) \
|
#define rol_off_64( M, j ) \
|
||||||
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
|
||||||
|
|
||||||
#define add_elt_b( M, H, j ) \
|
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||||
_mm256_xor_si256( \
|
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||||
_mm256_add_epi64( \
|
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
|
|
||||||
rol_off_64( M, j, 3 ) ), \
|
|
||||||
rol_off_64( M, j, 10 ) ), \
|
|
||||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
|
||||||
H[ ( (j)+7 ) & 0xF ] )
|
|
||||||
|
|
||||||
|
#define expand1_b( qt, i ) \
|
||||||
#define expand1b( qt, M, H, i ) \
|
mm256_add4_64( \
|
||||||
_mm256_add_epi64( mm256_add4_64( \
|
|
||||||
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
|
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
|
||||||
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
|
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
|
||||||
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
|
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
|
||||||
@@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
|||||||
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
|
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
|
||||||
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
|
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
|
||||||
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
|
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
|
||||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
|
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
|
||||||
add_elt_b( M, H, (i)-16 ) )
|
|
||||||
|
|
||||||
#define expand2b( qt, M, H, i) \
|
#define expand2_b( qt, i) \
|
||||||
_mm256_add_epi64( mm256_add4_64( \
|
mm256_add4_64( \
|
||||||
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
|
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
|
||||||
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
|
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
|
||||||
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
|
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
|
||||||
@@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
|||||||
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
|
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
|
||||||
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
|
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
|
||||||
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
|
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
|
||||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
|
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
|
||||||
add_elt_b( M, H, (i)-16 ) )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define Wb0 \
|
#define Wb0 \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
_mm256_add_epi64( mh[13], mh[14] ) )
|
||||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
|
||||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define Wb1 \
|
#define Wb1 \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
_mm256_sub_epi64( mh[14], mh[15] ) )
|
||||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
|
|
||||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define Wb2 \
|
#define Wb2 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
|
||||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define Wb3 \
|
#define Wb3 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
_mm256_sub_epi64( mh[10], \
|
||||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
mh[13] ) )
|
||||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
|
||||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
#define Wb4 \
|
#define Wb4 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
_mm256_add_epi64( mh[11], mh[14] ) )
|
||||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
|
||||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define Wb5 \
|
#define Wb5 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
|
||||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define Wb6 \
|
#define Wb6 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
_mm256_sub_epi64( mh[11], mh[13] ) )
|
||||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
|
||||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
#define Wb7 \
|
#define Wb7 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
_mm256_add_epi64( mh[12], mh[14] ) )
|
||||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
|
||||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define Wb8 \
|
#define Wb8 \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
_mm256_sub_epi64( mh[13], mh[15] ) )
|
||||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
|
||||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define Wb9 \
|
#define Wb9 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
_mm256_sub_epi64( mh[ 7], mh[14] ) )
|
||||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
|
||||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define Wb10 \
|
#define Wb10 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
_mm256_sub_epi64( mh[ 7], mh[15] ) )
|
||||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
|
||||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define Wb11 \
|
#define Wb11 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
|
||||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
|
||||||
|
|
||||||
#define Wb12 \
|
#define Wb12 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
_mm256_sub_epi64( mh[ 9], mh[10] ) )
|
||||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
|
||||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
|
||||||
|
|
||||||
#define Wb13 \
|
#define Wb13 \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
_mm256_add_epi64( mh[10], mh[11] ) )
|
||||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
|
||||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
|
||||||
|
|
||||||
#define Wb14 \
|
#define Wb14 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
_mm256_add_epi64( mh[11], mh[12] ) )
|
||||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
|
||||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
|
||||||
|
|
||||||
#define Wb15 \
|
#define Wb15 \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( \
|
||||||
_mm256_sub_epi64( \
|
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
_mm256_sub_epi64( mh[ 9], mh[13] ) )
|
||||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
|
||||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
|
||||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
|
|
||||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||||
{
|
{
|
||||||
__m256i qt[32], xl, xh;
|
__m256i qt[32], xl, xh;
|
||||||
|
__m256i mh[16];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for ( i = 0; i < 16; i++ )
|
||||||
|
mh[i] = _mm256_xor_si256( M[i], H[i] );
|
||||||
|
|
||||||
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
|
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
|
||||||
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
|
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
|
||||||
@@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
|||||||
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
|
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
|
||||||
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
|
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
|
||||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||||
qt[16] = expand1b( qt, M, H, 16 );
|
|
||||||
qt[17] = expand1b( qt, M, H, 17 );
|
__m256i mj[16];
|
||||||
qt[18] = expand2b( qt, M, H, 18 );
|
for ( i = 0; i < 16; i++ )
|
||||||
qt[19] = expand2b( qt, M, H, 19 );
|
mj[i] = rol_off_64( M, i );
|
||||||
qt[20] = expand2b( qt, M, H, 20 );
|
|
||||||
qt[21] = expand2b( qt, M, H, 21 );
|
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||||
qt[22] = expand2b( qt, M, H, 22 );
|
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||||
qt[23] = expand2b( qt, M, H, 23 );
|
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||||
qt[24] = expand2b( qt, M, H, 24 );
|
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
|
||||||
qt[25] = expand2b( qt, M, H, 25 );
|
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||||
qt[26] = expand2b( qt, M, H, 26 );
|
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
|
||||||
qt[27] = expand2b( qt, M, H, 27 );
|
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
|
||||||
qt[28] = expand2b( qt, M, H, 28 );
|
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
|
||||||
qt[29] = expand2b( qt, M, H, 29 );
|
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
|
||||||
qt[30] = expand2b( qt, M, H, 30 );
|
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
|
||||||
qt[31] = expand2b( qt, M, H, 31 );
|
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
|
||||||
|
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
|
||||||
|
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
|
||||||
|
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
|
||||||
|
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
|
||||||
|
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
|
||||||
|
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
|
||||||
|
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
|
||||||
|
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
|
||||||
|
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
|
||||||
|
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||||
|
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
|
||||||
|
|
||||||
|
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||||
|
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||||
|
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
|
||||||
|
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
|
||||||
|
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
|
||||||
|
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
|
||||||
|
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
|
||||||
|
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
|
||||||
|
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
|
||||||
|
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
|
||||||
|
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
|
||||||
|
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
|
||||||
|
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
|
||||||
|
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
|
||||||
|
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
|
||||||
|
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
|
||||||
|
|
||||||
xl = _mm256_xor_si256(
|
xl = _mm256_xor_si256(
|
||||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||||
@@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
|||||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||||
|
|
||||||
|
|
||||||
#define DH1L( m, sl, sr, a, b, c ) \
|
#define DH1L( m, sl, sr, a, b, c ) \
|
||||||
_mm256_add_epi64( \
|
_mm256_add_epi64( \
|
||||||
_mm256_xor_si256( M[m], \
|
_mm256_xor_si256( M[m], \
|
||||||
@@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||||
|
|
||||||
#define rol8w_off_64( M, j, off ) \
|
#define rol8w_off_64( M, j ) \
|
||||||
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
|
||||||
|
|
||||||
#define add_elt_b8( M, H, j ) \
|
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||||
_mm512_xor_si512( \
|
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||||
_mm512_add_epi64( \
|
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||||
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
|
|
||||||
rol8w_off_64( M, j, 3 ) ), \
|
|
||||||
rol8w_off_64( M, j, 10 ) ), \
|
|
||||||
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
|
||||||
H[ ( (j)+7 ) & 0xF ] )
|
|
||||||
|
|
||||||
#define expand1b8( qt, M, H, i ) \
|
#define expand1_b8( qt, i ) \
|
||||||
_mm512_add_epi64( mm512_add4_64( \
|
mm512_add4_64( \
|
||||||
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
||||||
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
||||||
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
||||||
@@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
||||||
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
||||||
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
||||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
|
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
|
||||||
add_elt_b8( M, H, (i)-16 ) )
|
|
||||||
|
|
||||||
#define expand2b8( qt, M, H, i) \
|
#define expand2_b8( qt, i) \
|
||||||
_mm512_add_epi64( mm512_add4_64( \
|
mm512_add4_64( \
|
||||||
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
||||||
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
||||||
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
||||||
@@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
||||||
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
||||||
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
||||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
|
||||||
add_elt_b8( M, H, (i)-16 ) )
|
|
||||||
|
|
||||||
#define W8b0 \
|
#define W8b0 \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
_mm512_add_epi64( mh[13], mh[14] ) )
|
||||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
|
||||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define W8b1 \
|
#define W8b1 \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
_mm512_sub_epi64( mh[14], mh[15] ) )
|
||||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
|
|
||||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define W8b2 \
|
#define W8b2 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
|
||||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define W8b3 \
|
#define W8b3 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
_mm512_sub_epi64( mh[10], mh[13] ) )
|
||||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
|
||||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
#define W8b4 \
|
#define W8b4 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
_mm512_add_epi64( mh[11], mh[14] ) )
|
||||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
|
||||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define W8b5 \
|
#define W8b5 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
|
||||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define W8b6 \
|
#define W8b6 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
_mm512_sub_epi64( mh[11], mh[13] ) )
|
||||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
|
||||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
#define W8b7 \
|
#define W8b7 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
_mm512_add_epi64( mh[12], mh[14] ) )
|
||||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
|
||||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define W8b8 \
|
#define W8b8 \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
_mm512_sub_epi64( mh[13], mh[15] ) )
|
||||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
|
||||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define W8b9 \
|
#define W8b9 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
_mm512_sub_epi64( mh[ 7], mh[14] ) )
|
||||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
|
||||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
|
||||||
|
|
||||||
#define W8b10 \
|
#define W8b10 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
_mm512_sub_epi64( mh[ 7], mh[15] ) )
|
||||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
|
||||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
|
||||||
|
|
||||||
#define W8b11 \
|
#define W8b11 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
|
||||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
|
||||||
|
|
||||||
#define W8b12 \
|
#define W8b12 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
_mm512_sub_epi64( mh[ 9], mh[10] ) )
|
||||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
|
||||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
|
||||||
|
|
||||||
#define W8b13 \
|
#define W8b13 \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
_mm512_add_epi64( mh[10], mh[11] ) )
|
||||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
|
||||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
|
||||||
|
|
||||||
#define W8b14 \
|
#define W8b14 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_add_epi64( \
|
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
_mm512_add_epi64( mh[11], mh[12] ) )
|
||||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
|
||||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
|
||||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
|
||||||
|
|
||||||
#define W8b15 \
|
#define W8b15 \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( \
|
||||||
_mm512_sub_epi64( \
|
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
_mm512_sub_epi64( mh[ 9], mh[13] ) )
|
||||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
|
||||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
|
||||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
|
||||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
|
||||||
|
|
||||||
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||||
__m512i dH[16] )
|
__m512i dH[16] )
|
||||||
{
|
{
|
||||||
__m512i qt[32], xl, xh;
|
__m512i qt[32], xl, xh;
|
||||||
|
__m512i mh[16];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for ( i = 0; i < 16; i++ )
|
||||||
|
mh[i] = _mm512_xor_si512( M[i], H[i] );
|
||||||
|
|
||||||
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
||||||
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
||||||
@@ -1268,57 +1169,90 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
|||||||
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
||||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||||
qt[16] = expand1b8( qt, M, H, 16 );
|
|
||||||
qt[17] = expand1b8( qt, M, H, 17 );
|
|
||||||
qt[18] = expand2b8( qt, M, H, 18 );
|
|
||||||
qt[19] = expand2b8( qt, M, H, 19 );
|
|
||||||
qt[20] = expand2b8( qt, M, H, 20 );
|
|
||||||
qt[21] = expand2b8( qt, M, H, 21 );
|
|
||||||
qt[22] = expand2b8( qt, M, H, 22 );
|
|
||||||
qt[23] = expand2b8( qt, M, H, 23 );
|
|
||||||
qt[24] = expand2b8( qt, M, H, 24 );
|
|
||||||
qt[25] = expand2b8( qt, M, H, 25 );
|
|
||||||
qt[26] = expand2b8( qt, M, H, 26 );
|
|
||||||
qt[27] = expand2b8( qt, M, H, 27 );
|
|
||||||
qt[28] = expand2b8( qt, M, H, 28 );
|
|
||||||
qt[29] = expand2b8( qt, M, H, 29 );
|
|
||||||
qt[30] = expand2b8( qt, M, H, 30 );
|
|
||||||
qt[31] = expand2b8( qt, M, H, 31 );
|
|
||||||
|
|
||||||
xl = _mm512_xor_si512(
|
__m512i mj[16];
|
||||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
for ( i = 0; i < 16; i++ )
|
||||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
mj[i] = rol8w_off_64( M, i );
|
||||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
|
||||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
|
||||||
|
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
|
||||||
|
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
|
||||||
|
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
|
||||||
|
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
|
||||||
|
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
|
||||||
|
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
|
||||||
|
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
|
||||||
|
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
|
||||||
|
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
|
||||||
|
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
|
||||||
|
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
|
||||||
|
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
|
||||||
|
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
|
||||||
|
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
|
||||||
|
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||||
|
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
|
||||||
|
|
||||||
|
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||||
|
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||||
|
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
|
||||||
|
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
|
||||||
|
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
|
||||||
|
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
|
||||||
|
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
|
||||||
|
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
|
||||||
|
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
|
||||||
|
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
|
||||||
|
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
|
||||||
|
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
|
||||||
|
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
|
||||||
|
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
|
||||||
|
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
|
||||||
|
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
|
||||||
|
|
||||||
|
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||||
|
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||||
|
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||||
|
|
||||||
|
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||||
|
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||||
|
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||||
|
|
||||||
#define DH1L( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
                           _mm512_srli_epi64( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
#define DH1L( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
                                       _mm512_srli_epi64( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH1R( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
                           _mm512_slli_epi64( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
                                       _mm512_slli_epi64( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );
#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )

#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );
#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )

   dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
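// Aside (illustrative, not code from this commit): mm512_xor3 is assumed here
// to be a thin wrapper around the AVX512 vpternlogq instruction. A minimal
// sketch of such a helper:

#include <immintrin.h>

static inline __m512i xor3_512( __m512i a, __m512i b, __m512i c )
{
   // imm8 0x96 is the truth table of a three-input XOR.
   return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}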
|||||||
@@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp )
|
|||||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 8 ways, 4 way parallel double buffered
|
||||||
|
static void transform_4way_2buf( cube_4way_2buf_context *sp )
|
||||||
|
{
|
||||||
|
int r;
|
||||||
|
const int rounds = sp->rounds;
|
||||||
|
|
||||||
|
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||||
|
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||||
|
__m512i tx0, tx1, ty0, ty1;
|
||||||
|
|
||||||
|
x0 = _mm512_load_si512( (__m512i*)sp->h0 );
|
||||||
|
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
|
||||||
|
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
|
||||||
|
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
|
||||||
|
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
|
||||||
|
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
|
||||||
|
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
|
||||||
|
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
|
||||||
|
|
||||||
|
y0 = _mm512_load_si512( (__m512i*)sp->h1 );
|
||||||
|
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
|
||||||
|
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
|
||||||
|
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
|
||||||
|
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
|
||||||
|
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
|
||||||
|
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
|
||||||
|
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
|
||||||
|
|
||||||
|
|
||||||
|
for ( r = 0; r < rounds; ++r )
|
||||||
|
{
|
||||||
|
x4 = _mm512_add_epi32( x0, x4 );
|
||||||
|
y4 = _mm512_add_epi32( y0, y4 );
|
||||||
|
tx0 = x0;
|
||||||
|
ty0 = y0;
|
||||||
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
|
y5 = _mm512_add_epi32( y1, y5 );
|
||||||
|
tx1 = x1;
|
||||||
|
ty1 = y1;
|
||||||
|
x0 = mm512_rol_32( x2, 7 );
|
||||||
|
y0 = mm512_rol_32( y2, 7 );
|
||||||
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
|
y6 = _mm512_add_epi32( y2, y6 );
|
||||||
|
x1 = mm512_rol_32( x3, 7 );
|
||||||
|
y1 = mm512_rol_32( y3, 7 );
|
||||||
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
|
y7 = _mm512_add_epi32( y3, y7 );
|
||||||
|
|
||||||
|
|
||||||
|
x2 = mm512_rol_32( tx0, 7 );
|
||||||
|
y2 = mm512_rol_32( ty0, 7 );
|
||||||
|
x0 = _mm512_xor_si512( x0, x4 );
|
||||||
|
y0 = _mm512_xor_si512( y0, y4 );
|
||||||
|
x4 = mm512_swap128_64( x4 );
|
||||||
|
x3 = mm512_rol_32( tx1, 7 );
|
||||||
|
y3 = mm512_rol_32( ty1, 7 );
|
||||||
|
y4 = mm512_swap128_64( y4 );
|
||||||
|
|
||||||
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
|
y1 = _mm512_xor_si512( y1, y5 );
|
||||||
|
x5 = mm512_swap128_64( x5 );
|
||||||
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
|
y2 = _mm512_xor_si512( y2, y6 );
|
||||||
|
y5 = mm512_swap128_64( y5 );
|
||||||
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
|
y3 = _mm512_xor_si512( y3, y7 );
|
||||||
|
|
||||||
|
x6 = mm512_swap128_64( x6 );
|
||||||
|
x4 = _mm512_add_epi32( x0, x4 );
|
||||||
|
y4 = _mm512_add_epi32( y0, y4 );
|
||||||
|
y6 = mm512_swap128_64( y6 );
|
||||||
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
|
y5 = _mm512_add_epi32( y1, y5 );
|
||||||
|
x7 = mm512_swap128_64( x7 );
|
||||||
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
|
y6 = _mm512_add_epi32( y2, y6 );
|
||||||
|
tx0 = x0;
|
||||||
|
ty0 = y0;
|
||||||
|
y7 = mm512_swap128_64( y7 );
|
||||||
|
tx1 = x2;
|
||||||
|
ty1 = y2;
|
||||||
|
x0 = mm512_rol_32( x1, 11 );
|
||||||
|
y0 = mm512_rol_32( y1, 11 );
|
||||||
|
|
||||||
|
x7 = _mm512_add_epi32( x3, x7 );
|
||||||
|
y7 = _mm512_add_epi32( y3, y7 );
|
||||||
|
|
||||||
|
x1 = mm512_rol_32( tx0, 11 );
|
||||||
|
y1 = mm512_rol_32( ty0, 11 );
|
||||||
|
x0 = _mm512_xor_si512( x0, x4 );
|
||||||
|
x4 = mm512_swap64_32( x4 );
|
||||||
|
y0 = _mm512_xor_si512( y0, y4 );
|
||||||
|
x2 = mm512_rol_32( x3, 11 );
|
||||||
|
y4 = mm512_swap64_32( y4 );
|
||||||
|
y2 = mm512_rol_32( y3, 11 );
|
||||||
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
|
x5 = mm512_swap64_32( x5 );
|
||||||
|
y1 = _mm512_xor_si512( y1, y5 );
|
||||||
|
x3 = mm512_rol_32( tx1, 11 );
|
||||||
|
y5 = mm512_swap64_32( y5 );
|
||||||
|
y3 = mm512_rol_32( ty1, 11 );
|
||||||
|
|
||||||
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
|
x6 = mm512_swap64_32( x6 );
|
||||||
|
y2 = _mm512_xor_si512( y2, y6 );
|
||||||
|
y6 = mm512_swap64_32( y6 );
|
||||||
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
|
x7 = mm512_swap64_32( x7 );
|
||||||
|
y3 = _mm512_xor_si512( y3, y7 );
|
||||||
|
|
||||||
|
y7 = mm512_swap64_32( y7 );
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0, x0 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1, y0 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
|
||||||
|
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
|
||||||
|
}
|
||||||
|
|
||||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||||
int blockbytes )
|
int blockbytes )
|
||||||
{
|
{
|
||||||
@@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||||
|
void *output0, void *output1, int hashbitlen,
|
||||||
|
const void *data0, const void *data1, size_t size )
|
||||||
|
{
|
||||||
|
__m512i *h0 = (__m512i*)sp->h0;
|
||||||
|
__m512i *h1 = (__m512i*)sp->h1;
|
||||||
|
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||||
|
: (__m128i*)IV256 );
|
||||||
|
sp->hashlen = hashbitlen/128;
|
||||||
|
sp->blocksize = 32/16;
|
||||||
|
sp->rounds = 16;
|
||||||
|
sp->pos = 0;
|
||||||
|
|
||||||
|
h1[0] = h0[0] = m512_const1_128( iv[0] );
|
||||||
|
h1[1] = h0[1] = m512_const1_128( iv[1] );
|
||||||
|
h1[2] = h0[2] = m512_const1_128( iv[2] );
|
||||||
|
h1[3] = h0[3] = m512_const1_128( iv[3] );
|
||||||
|
h1[4] = h0[4] = m512_const1_128( iv[4] );
|
||||||
|
h1[5] = h0[5] = m512_const1_128( iv[5] );
|
||||||
|
h1[6] = h0[6] = m512_const1_128( iv[6] );
|
||||||
|
h1[7] = h0[7] = m512_const1_128( iv[7] );
|
||||||
|
|
||||||
|
const int len = size >> 4;
|
||||||
|
const __m512i *in0 = (__m512i*)data0;
|
||||||
|
const __m512i *in1 = (__m512i*)data1;
|
||||||
|
__m512i *hash0 = (__m512i*)output0;
|
||||||
|
__m512i *hash1 = (__m512i*)output1;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for ( i = 0; i < len; i++ )
|
||||||
|
{
|
||||||
|
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
|
||||||
|
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
|
||||||
|
sp->pos++;
|
||||||
|
if ( sp->pos == sp->blocksize )
|
||||||
|
{
|
||||||
|
transform_4way_2buf( sp );
|
||||||
|
sp->pos = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||||
|
__m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
|
||||||
|
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
|
||||||
|
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
|
||||||
|
|
||||||
|
transform_4way_2buf( sp );
|
||||||
|
|
||||||
|
tmp = m512_const2_64( 0x0000000100000000, 0 );
|
||||||
|
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
|
||||||
|
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
|
||||||
|
|
||||||
|
for ( i = 0; i < 10; ++i )
|
||||||
|
transform_4way_2buf( sp );
|
||||||
|
|
||||||
|
memcpy( hash0, sp->h0, sp->hashlen<<6);
|
||||||
|
memcpy( hash1, sp->h1, sp->hashlen<<6);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||||
const void *data, size_t size )
|
const void *data, size_t size )
|
||||||
@@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
|||||||
|
|
||||||
// 2 way 128

// This isn't expected to be used with AVX512, so the HW rotate instruction
// is assumed not to be available.
// Use double buffering to optimize serial bit rotations. Full double
// buffering isn't practical because it needs twice as many registers,
// and AVX2 has only half as many as AVX512.
#define ROL2( out0, out1, in0, in1, c ) \
{ \
   __m256i t0 = _mm256_slli_epi32( in0, c ); \
   __m256i t1 = _mm256_slli_epi32( in1, c ); \
   out0 = _mm256_srli_epi32( in0, 32-(c) ); \
   out1 = _mm256_srli_epi32( in1, 32-(c) ); \
   out0 = _mm256_or_si256( out0, t0 ); \
   out1 = _mm256_or_si256( out1, t1 ); \
}
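// For reference, a scalar model of the per-lane rotation ROL2 performs twice
// (illustrative only, not part of this commit); issuing two independent
// rotations back to back lets the shifts of one overlap the OR of the other.

#include <stdint.h>

static inline uint32_t rol32_ref( uint32_t x, unsigned c )
{
   return ( x << c ) | ( x >> ( 32 - c ) );   // valid for c = 1..31
}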
|
||||||
|
|
||||||
static void transform_2way( cube_2way_context *sp )
|
static void transform_2way( cube_2way_context *sp )
|
||||||
{
|
{
|
||||||
int r;
|
int r;
|
||||||
@@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp )
|
|||||||
x7 = _mm256_add_epi32( x3, x7 );
|
x7 = _mm256_add_epi32( x3, x7 );
|
||||||
y0 = x0;
|
y0 = x0;
|
||||||
y1 = x1;
|
y1 = x1;
|
||||||
x0 = mm256_rol_32( x2, 7 );
|
ROL2( x0, x1, x2, x3, 7 );
|
||||||
x1 = mm256_rol_32( x3, 7 );
|
ROL2( x2, x3, y0, y1, 7 );
|
||||||
x2 = mm256_rol_32( y0, 7 );
|
|
||||||
x3 = mm256_rol_32( y1, 7 );
|
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
x0 = _mm256_xor_si256( x0, x4 );
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
|
||||||
x4 = mm256_swap128_64( x4 );
|
x4 = mm256_swap128_64( x4 );
|
||||||
x5 = mm256_swap128_64( x5 );
|
|
||||||
x6 = mm256_swap128_64( x6 );
|
|
||||||
x7 = mm256_swap128_64( x7 );
|
|
||||||
x4 = _mm256_add_epi32( x0, x4 );
|
|
||||||
x5 = _mm256_add_epi32( x1, x5 );
|
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
|
||||||
x7 = _mm256_add_epi32( x3, x7 );
|
|
||||||
y0 = x0;
|
|
||||||
y1 = x2;
|
|
||||||
x0 = mm256_rol_32( x1, 11 );
|
|
||||||
x1 = mm256_rol_32( y0, 11 );
|
|
||||||
x2 = mm256_rol_32( x3, 11 );
|
|
||||||
x3 = mm256_rol_32( y1, 11 );
|
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
x1 = _mm256_xor_si256( x1, x5 );
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
x2 = _mm256_xor_si256( x2, x6 );
|
||||||
|
x5 = mm256_swap128_64( x5 );
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
|
x4 = _mm256_add_epi32( x0, x4 );
|
||||||
|
x6 = mm256_swap128_64( x6 );
|
||||||
|
y0 = x0;
|
||||||
|
x5 = _mm256_add_epi32( x1, x5 );
|
||||||
|
x7 = mm256_swap128_64( x7 );
|
||||||
|
x6 = _mm256_add_epi32( x2, x6 );
|
||||||
|
y1 = x2;
|
||||||
|
ROL2( x0, x1, x1, y0, 11 );
|
||||||
|
x7 = _mm256_add_epi32( x3, x7 );
|
||||||
|
ROL2( x2, x3, x3, y1, 11 );
|
||||||
|
x0 = _mm256_xor_si256( x0, x4 );
|
||||||
x4 = mm256_swap64_32( x4 );
|
x4 = mm256_swap64_32( x4 );
|
||||||
|
x1 = _mm256_xor_si256( x1, x5 );
|
||||||
x5 = mm256_swap64_32( x5 );
|
x5 = mm256_swap64_32( x5 );
|
||||||
|
x2 = _mm256_xor_si256( x2, x6 );
|
||||||
x6 = mm256_swap64_32( x6 );
|
x6 = mm256_swap64_32( x6 );
|
||||||
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
x7 = mm256_swap64_32( x7 );
|
x7 = mm256_swap64_32( x7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -17,41 +17,41 @@ struct _cube_4way_context
|
|||||||
int pos;
|
int pos;
|
||||||
} __attribute__ ((aligned (128)));
|
} __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
|
struct _cube_4way_2buf_context
|
||||||
|
{
|
||||||
|
__m512i h0[8];
|
||||||
|
__m512i h1[8];
|
||||||
|
int hashlen;
|
||||||
|
int rounds;
|
||||||
|
int blocksize;
|
||||||
|
int pos;
|
||||||
|
} __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
|
|
||||||
typedef struct _cube_4way_context cube_4way_context;
|
typedef struct _cube_4way_context cube_4way_context;
|
||||||
|
|
||||||
|
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
|
||||||
|
|
||||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||||
int blockbytes );
|
int blockbytes );
|
||||||
|
|
||||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||||
|
|
||||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||||
|
|
||||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||||
const void *data, size_t size );
|
const void *data, size_t size );
|
||||||
|
|
||||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||||
const void *data, size_t size );
|
const void *data, size_t size );
|
||||||
|
|
||||||
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
|
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||||
const void *data, size_t size );
|
void *output0, void *output1, int hashbitlen,
|
||||||
|
const void *data0, const void *data1, size_t size );
|
||||||
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
|
|
||||||
#define cube512_4way_update cube_4way_update
|
|
||||||
#define cube512_4way_update_close cube_4way_update
|
|
||||||
#define cube512_4way_close cube_4way_update
|
|
||||||
#define cube512_4way_full( sp, output, data, size ) \
|
|
||||||
cube_4way_full( sp, output, 512, data, size )
|
|
||||||
#define cube512_4x256_full( sp, output, data, size ) \
|
|
||||||
cube_4x256_full( sp, output, 512, data, size )
|
|
||||||
|
|
||||||
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
|
|
||||||
#define cube256_4way_update cube_4way_update
|
|
||||||
#define cube256_4way_update_close cube_4way_update
|
|
||||||
#define cube256_4way_close cube_4way_update
|
|
||||||
#define cube256_4way_full( sp, output, data, size ) \
|
|
||||||
cube_4way_full( sp, output, 256, data, size )
|
|
||||||
#define cube256_4x256_full( sp, output, data, size ) \
|
|
||||||
cube_4x256_full( sp, output, 256, data, size )
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 2x128, 2 way parallel SSE2
|
// 2x128, 2 way parallel AVX2
|
||||||
|
|
||||||
struct _cube_2way_context
|
struct _cube_2way_context
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
   for ( r = 0; r < rounds; ++r )
   {
      x1 = _mm512_add_epi32( x0, x1 );
      x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
      x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
      x0 = _mm512_xor_si512( mm512_rol_32(
                             mm512_swap256_128( x0 ), 11 ), x1 );
      x1 = mm512_swap64_32( x1 );
   }
   for ( r = 0; r < rounds; ++r )
   {
      x1 = _mm512_add_epi32( x0, x1 );
      x0 = mm512_swap_256( x0 );
      x0 = mm512_rol_32( x0, 7 );
      x0 = _mm512_xor_si512( x0, x1 );
      x1 = mm512_swap128_64( x1 );
      x1 = _mm512_add_epi32( x0, x1 );
      x0 = mm512_swap256_128( x0 );
      x0 = mm512_rol_32( x0, 11 );
      x0 = _mm512_xor_si512( x0, x1 );
      x1 = mm512_swap64_32( x1 );
   }
||||||
|
|
||||||
|
|||||||
@@ -53,10 +53,24 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
|
|||||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||||
|
|
||||||
|
|
||||||
|
#define ECHO_SUBBYTES4(state, j) \
   state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
   state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
   state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
   state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )

#define ECHO_SUBBYTES(state, i, j) \
   state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
   state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
   k1 = _mm_add_epi32(k1, M128(const1))
#define ECHO_SUBBYTES(state, i, j) \
   state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero))
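// Illustrative sketch (an assumed reading of the macros above, not code from
// this commit): one ECHO sub-word step is two AES rounds, the first keyed with
// a running 128-bit counter, the second with an all-zero key.

#include <immintrin.h>

static inline __m128i echo_subword_sketch( __m128i cell, __m128i *k1 )
{
   cell = _mm_aesenc_si128( cell, *k1 );                      // first round, counter key
   *k1  = _mm_add_epi32( *k1, _mm_set_epi32( 0, 0, 0, 1 ) );  // bump the counter
   return _mm_aesenc_si128( cell, _mm_setzero_si128() );      // second round, zero key
}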
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||||
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
|||||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||||
s2 = _mm_xor_si128(s2, t2);\
|
s2 = _mm_xor_si128(s2, t2);\
|
||||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||||
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
|||||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||||
s2 = _mm_xor_si128(s2, t2);\
|
s2 = _mm_xor_si128(s2, t2);\
|
||||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||||
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
|||||||
s2 = _mm_xor_si128(s2, t2);\
|
s2 = _mm_xor_si128(s2, t2);\
|
||||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||||
|
|
||||||
|
|
||||||
|
#define ECHO_ROUND_UNROLL2 \
|
||||||
|
ECHO_SUBBYTES4(_state, 0);\
|
||||||
|
ECHO_SUBBYTES4(_state, 1);\
|
||||||
|
ECHO_SUBBYTES4(_state, 2);\
|
||||||
|
ECHO_SUBBYTES4(_state, 3);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 0);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 1);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 2);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 3);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||||
|
|
||||||
|
/*
|
||||||
#define ECHO_ROUND_UNROLL2 \
|
#define ECHO_ROUND_UNROLL2 \
|
||||||
ECHO_SUBBYTES(_state, 0, 0);\
|
ECHO_SUBBYTES(_state, 0, 0);\
|
||||||
ECHO_SUBBYTES(_state, 1, 0);\
|
ECHO_SUBBYTES(_state, 1, 0);\
|
||||||
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
|||||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
#define SAVESTATE(dst, src)\
|
#define SAVESTATE(dst, src)\
|
||||||
|
|||||||
@@ -10,22 +10,27 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
|
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
|
||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
// do these need to be reversed?
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
#define ECHO_SUBBYTES4(state, j) \
|
||||||
#define mul2mask \
|
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
||||||
m512_const2_64( 0, 0x00001b00 )
|
k1 = _mm512_add_epi32( k1, one ); \
|
||||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
|
||||||
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
k1 = _mm512_add_epi32( k1, one ); \
|
||||||
|
state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
|
||||||
#define lsbmask m512_const1_32( 0x01010101 )
|
k1 = _mm512_add_epi32( k1, one ); \
|
||||||
|
state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
|
||||||
|
k1 = _mm512_add_epi32( k1, one ); \
|
||||||
|
state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
|
||||||
|
state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
|
||||||
|
state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
|
||||||
|
state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
|
||||||
|
|
||||||
#define ECHO_SUBBYTES( state, i, j ) \
|
#define ECHO_SUBBYTES( state, i, j ) \
|
||||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
|
k1 = _mm512_add_epi32( k1, one ); \
|
||||||
k1 = _mm512_add_epi32( k1, m512_one_128 );
|
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero );
|
||||||
|
|
||||||
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
|
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -46,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||||
s2 = _mm512_xor_si512( s2, t2 );\
|
s2 = _mm512_xor_si512( s2, t2 );\
|
||||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
|
||||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
|
||||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||||
@@ -57,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
|
||||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
|
||||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||||
@@ -68,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
|
||||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
|
||||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
#define ECHO_ROUND_UNROLL2 \
|
||||||
|
ECHO_SUBBYTES4(_state, 0);\
|
||||||
|
ECHO_SUBBYTES4(_state, 1);\
|
||||||
|
ECHO_SUBBYTES4(_state, 2);\
|
||||||
|
ECHO_SUBBYTES4(_state, 3);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 0);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 1);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 2);\
|
||||||
|
ECHO_SUBBYTES4(_state2, 3);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||||
|
|
||||||
|
/*
|
||||||
#define ECHO_ROUND_UNROLL2 \
|
#define ECHO_ROUND_UNROLL2 \
|
||||||
ECHO_SUBBYTES(_state, 0, 0);\
|
ECHO_SUBBYTES(_state, 0, 0);\
|
||||||
ECHO_SUBBYTES(_state, 1, 0);\
|
ECHO_SUBBYTES(_state, 1, 0);\
|
||||||
@@ -114,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||||
|
*/
|
||||||
|
|
||||||
#define SAVESTATE(dst, src)\
|
#define SAVESTATE(dst, src)\
|
||||||
dst[0][0] = src[0][0];\
|
dst[0][0] = src[0][0];\
|
||||||
@@ -140,6 +162,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
|
|||||||
unsigned int r, b, i, j;
|
unsigned int r, b, i, j;
|
||||||
__m512i t1, t2, s2, k1;
|
__m512i t1, t2, s2, k1;
|
||||||
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
|
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||||
|
__m512i one = m512_one_128;
|
||||||
|
__m512i mul2mask = m512_const2_64( 0, 0x00001b00 );
|
||||||
|
__m512i lsbmask = m512_const1_32( 0x01010101 );
|
||||||
|
|
||||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||||
@@ -404,10 +429,24 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
|||||||
|
|
||||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||||
|
|
||||||
|
#define ECHO_SUBBYTES4_2WAY( state, j ) \
|
||||||
|
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
|
||||||
|
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||||
|
state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
|
||||||
|
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||||
|
state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
|
||||||
|
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||||
|
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
|
||||||
|
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||||
|
state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
|
||||||
|
state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
|
||||||
|
state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
|
||||||
|
state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
|
||||||
|
|
||||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||||
|
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
|
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
|
||||||
k1 = _mm256_add_epi32( k1, m256_one_128 );
|
|
||||||
|
|
||||||
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
|
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -455,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
|||||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state, 0);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state, 1);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state, 2);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state, 3);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state2, 0);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state2, 1);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state2, 2);\
|
||||||
|
ECHO_SUBBYTES4_2WAY(_state2, 3);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||||
|
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||||
|
|
||||||
|
/*
|
||||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||||
@@ -496,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
|||||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||||
|
*/
|
||||||
|
|
||||||
#define SAVESTATE_2WAY(dst, src)\
|
#define SAVESTATE_2WAY(dst, src)\
|
||||||
dst[0][0] = src[0][0];\
|
dst[0][0] = src[0][0];\
|
||||||
|
|||||||
@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
|
|||||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||||
s7 = _mm_xor_si128(s7, t1)
|
s7 = _mm_xor_si128(s7, t1)
|
||||||
|
|
||||||
|
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
|
||||||
|
t2 = t0;\
|
||||||
|
t3 = _mm_add_epi8(t0, t0);\
|
||||||
|
t4 = _mm_add_epi8(t3, t3);\
|
||||||
|
t1 = _mm_srli_epi16(t0, 6);\
|
||||||
|
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||||
|
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||||
|
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||||
|
|
||||||
|
/*
|
||||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||||
s1 = x;\
|
s1 = x;\
|
||||||
s2 = _mm_add_epi8(x, x);\
|
s2 = _mm_add_epi8(x, x);\
|
||||||
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
|
|||||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||||
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||||
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||||
|
*/
|
||||||
|
|
||||||
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
|
#define SUBSTITUTE(r0, _t2 )\
|
||||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||||
|
|
||||||
|
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||||
|
t2 = t0;\
|
||||||
|
t3 = _mm_add_epi8(t0, t0);\
|
||||||
|
t4 = _mm_add_epi8(t3, t3);\
|
||||||
|
t1 = _mm_srli_epi16(t0, 6);\
|
||||||
|
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||||
|
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
|
||||||
|
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||||
|
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||||
|
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
|
||||||
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
|
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||||
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
|
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||||
|
t2 = mm128_xor3(t2, t3, t0 );\
|
||||||
|
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||||
|
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||||
|
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||||
|
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||||
|
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||||
|
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||||
|
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||||
|
t0 = _mm_xor_si128(t0, t3);\
|
||||||
|
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||||
|
|
||||||
|
/*
|
||||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||||
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
||||||
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
||||||
|
*/
|
||||||
|
|
||||||
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
||||||
t1 = t2;\
|
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
|
|
||||||
t4 = t1;\
|
t4 = t1;\
|
||||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
||||||
t4 = _mm_xor_si128(t4, t1);\
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
t1 = t4;\
|
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
|
|
||||||
t4 = _mm_xor_si128(t4, t1);\
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
t1 = t2;\
|
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
|
|
||||||
t4 = _mm_xor_si128(t4, t1);\
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
t2 = _mm_xor_si128(t2, t3);\
|
t2 = mm128_xor3(t2, t3, t0 );\
|
||||||
t2 = _mm_xor_si128(t2, t0);\
|
|
||||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||||
t4 = _mm_xor_si128(t4, t2);\
|
t4 = _mm_xor_si128(t4, t2);\
|
||||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||||
t4 = _mm_xor_si128(t4, t2);\
|
t4 = _mm_xor_si128(t4, t2);\
|
||||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||||
t1 = t0;\
|
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
|
|
||||||
t4 = _mm_xor_si128(t4, t1);\
|
t4 = _mm_xor_si128(t4, t1);\
|
||||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||||
t0 = _mm_xor_si128(t0, t3);\
|
t0 = _mm_xor_si128(t0, t3);\
|
||||||
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
|
|||||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
||||||
t4 = _mm_xor_si128(t4, t0)
|
t4 = _mm_xor_si128(t4, t0)
|
||||||
|
|
||||||
|
|
||||||
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
||||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||||
PACK_S0(r1c, r1a, _t0);\
|
PACK_S0(r1c, r1a, _t0);\
|
||||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE(r1c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||||
r2c = _mm_xor_si128(r2c, _t0);\
|
r2c = _mm_xor_si128(r2c, _t0);\
|
||||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||||
r2d = _mm_xor_si128(r2d, _t0);\
|
r2d = _mm_xor_si128(r2d, _t0);\
|
||||||
UNPACK_S0(r1c, r1a, _t3);\
|
UNPACK_S0(r1c, r1a, _t3);\
|
||||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE(r2c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||||
r3c = _mm_xor_si128(r3c, _t0);\
|
r3c = _mm_xor_si128(r3c, _t0);\
|
||||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||||
r3d = _mm_xor_si128(r3d, _t0);\
|
r3d = _mm_xor_si128(r3d, _t0);\
|
||||||
UNPACK_S0(r2c, r2a, _t3);\
|
UNPACK_S0(r2c, r2a, _t3);\
|
||||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE(r3c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||||
UNPACK_S0(r3c, r3a, _t3)
|
UNPACK_S0(r3c, r3a, _t3)
|
||||||
|
|
||||||
|
|
||||||
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
||||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||||
PACK_S0(r1c, r1a, _t0);\
|
PACK_S0(r1c, r1a, _t0);\
|
||||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE( r1c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||||
r2c = _mm_xor_si128(r2c, _t0);\
|
r2c = _mm_xor_si128(r2c, _t0);\
|
||||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||||
r2d = _mm_xor_si128(r2d, _t0);\
|
r2d = _mm_xor_si128(r2d, _t0);\
|
||||||
UNPACK_S0(r1c, r1a, _t3);\
|
UNPACK_S0(r1c, r1a, _t3);\
|
||||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE(r2c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||||
r3c = _mm_xor_si128(r3c, _t0);\
|
r3c = _mm_xor_si128(r3c, _t0);\
|
||||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||||
r3d = _mm_xor_si128(r3d, _t0);\
|
r3d = _mm_xor_si128(r3d, _t0);\
|
||||||
UNPACK_S0(r2c, r2a, _t3);\
|
UNPACK_S0(r2c, r2a, _t3);\
|
||||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE( r3c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||||
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
||||||
r4c = _mm_xor_si128(r4c, _t0);\
|
r4c = _mm_xor_si128(r4c, _t0);\
|
||||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||||
r4d = _mm_xor_si128(r4d, _t0);\
|
r4d = _mm_xor_si128(r4d, _t0);\
|
||||||
UNPACK_S0(r3c, r3a, _t3);\
|
UNPACK_S0(r3c, r3a, _t3);\
|
||||||
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
|
SUBSTITUTE( r4c, _t2 );\
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
||||||
UNPACK_S0(r4c, r4a, _t3)
|
UNPACK_S0(r4c, r4a, _t3)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define LOADCOLUMN(x, s, a)\
|
#define LOADCOLUMN(x, s, a)\
|
||||||
block[0] = col[(base + a + 0) % s];\
|
block[0] = col[(base + a + 0) % s];\
|
||||||
block[1] = col[(base + a + 1) % s];\
|
block[1] = col[(base + a + 1) % s];\
|
||||||
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while( uBlockCount > 0 )
|
while( uBlockCount > 0 )
|
||||||
{
|
{
|
||||||
TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
|
TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
|
||||||
ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
|
ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
|
||||||
_t0, _t1, _t2 );
|
_t0, _t1, _t2 );
|
||||||
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
|
SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
|
||||||
ctx->state[5], ctx->state[11], ctx->state[0],
|
ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
|
||||||
ctx->state[10], ctx->state[4], ctx->state[10],
|
ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
|
||||||
ctx->state[11], ctx->state[9], ctx->state[3],
|
ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
|
||||||
ctx->state[9], ctx->state[10], ctx->state[8],
|
|
||||||
ctx->state[2] );
|
|
||||||
|
|
||||||
ctx->base++;
|
ctx->base++;
|
||||||
pmsg += 4;
|
pmsg += 4;
|
||||||
uBlockCount--;
|
uBlockCount--;
|
||||||
if( uBlockCount == 0 ) break;
|
if( uBlockCount == 0 ) break;
|
||||||
|
|
||||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
|
TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
|
||||||
ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
|
ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
|
||||||
_t0, _t1, _t2 );
|
_t0, _t1, _t2 );
|
||||||
|
|
||||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
|
SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
|
||||||
ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
|
ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
|
||||||
ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]);
|
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
|
||||||
|
ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
|
||||||
|
|
||||||
ctx->base++;
|
ctx->base++;
|
||||||
pmsg += 4;
|
pmsg += 4;
|
||||||
uBlockCount--;
|
uBlockCount--;
|
||||||
if( uBlockCount == 0 ) break;
|
if( uBlockCount == 0 ) break;
|
||||||
|
|
||||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
|
TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
|
||||||
ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
|
ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
|
||||||
_t0, _t1, _t2);
|
_t0, _t1, _t2);
|
||||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
|
SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
|
||||||
ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
|
ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
|
||||||
ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
|
ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
|
||||||
ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
|
ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
|
||||||
|
|
||||||
ctx->base = 0;
|
ctx->base = 0;
|
||||||
pmsg += 4;
|
pmsg += 4;
|
||||||
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
|||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN(r0, 36, 0);
|
||||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
SUBSTITUTE(r0, _t2);
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN(r0, 36);
|
||||||
}
|
}
|
||||||
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
|||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN(r0, 36, 0);
|
||||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
SUBSTITUTE(r0, _t2);
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN(r0, 36);
|
||||||
|
|
||||||
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
|||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN(r0, 36, 0);
|
||||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
SUBSTITUTE(r0, _t2);
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN(r0, 36);
|
||||||
|
|
||||||
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
|||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN(r0, 36, 0);
|
||||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
SUBSTITUTE(r0, _t2);
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN(r0, 36);
|
||||||
|
|
||||||
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
|||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN(r0, 36, 0);
|
||||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
SUBSTITUTE(r0, _t2);
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN(r0, 36);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,10 @@
|
|||||||
|
|
||||||
#if defined(__AES__)
|
#if defined(__AES__)
|
||||||
|
|
||||||
|
#if !defined(__SSE4_1__)
|
||||||
|
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "algo/sha/sha3_common.h"
|
#include "algo/sha/sha3_common.h"
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
|
|||||||
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
|||||||
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
  j = _mm_xor_si128(j, j);\
  j = _mm_cmpgt_epi8(j, i);\
  i = _mm_add_epi8(i, i);\
  j = _mm_and_si128(j, k);\
  i = _mm_xor_si128(i, j);\
}
#define MUL2(i, j, k){\
  j = _mm_cmpgt_epi8( m128_zero, i);\
  i = _mm_add_epi8(i, i);\
  i = mm128_xorand(i, j, k );\
}

/**/
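// Illustrative scalar model of what MUL2 computes per byte (assumed intent,
// not code from this commit): doubling in GF(2^8) with the AES reduction
// polynomial, where the 0x1b correction is selected by the byte's sign bit,
// exactly as the vector code derives it with cmpgt against zero.

#include <stdint.h>

static inline uint8_t gf256_mul2_ref( uint8_t x )
{
   uint8_t mask = (uint8_t)( 0u - ( x >> 7 ) );   // 0xff if the top bit of x is set
   return (uint8_t)( ( x << 1 ) ^ ( mask & 0x1b ) );
}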
||||||
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
|||||||
We almost fit into 16 registers, need only 3 spills to memory.
|
We almost fit into 16 registers, need only 3 spills to memory.
|
||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
|
/* t_i = a_i + a_{i+1} */\
|
||||||
|
b6 = a0;\
|
||||||
|
b7 = a1;\
|
||||||
|
a0 = _mm_xor_si128(a0, a1);\
|
||||||
|
b0 = a2;\
|
||||||
|
a1 = _mm_xor_si128(a1, a2);\
|
||||||
|
b1 = a3;\
|
||||||
|
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||||
|
b2 = a4;\
|
||||||
|
a3 = _mm_xor_si128(a3, a4);\
|
||||||
|
b3 = a5;\
|
||||||
|
a4 = _mm_xor_si128(a4, a5);\
|
||||||
|
b4 = a6;\
|
||||||
|
a5 = _mm_xor_si128(a5, a6);\
|
||||||
|
b5 = a7;\
|
||||||
|
a6 = _mm_xor_si128(a6, a7);\
|
||||||
|
a7 = _mm_xor_si128(a7, b6);\
|
||||||
|
\
|
||||||
|
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||||
|
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||||
|
/* spill values y_4, y_5 to memory */\
|
||||||
|
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||||
|
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||||
|
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||||
|
b0 = a0;\
|
||||||
|
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||||
|
b1 = a1;\
|
||||||
|
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||||
|
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||||
|
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||||
|
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||||
|
\
|
||||||
|
/* compute x_i = t_i + t_{i+3} */\
|
||||||
|
a0 = _mm_xor_si128(a0, a3);\
|
||||||
|
a1 = _mm_xor_si128(a1, a4);\
|
||||||
|
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||||
|
a3 = _mm_xor_si128(a3, a6);\
|
||||||
|
a4 = _mm_xor_si128(a4, a7);\
|
||||||
|
a5 = _mm_xor_si128(a5, b0);\
|
||||||
|
a6 = _mm_xor_si128(a6, b1);\
|
||||||
|
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||||
|
\
|
||||||
|
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||||
|
/* compute w_i : add y_{i+4} */\
|
||||||
|
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||||
|
MUL2(a0, b0, b1);\
|
||||||
|
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||||
|
MUL2(a1, b0, b1);\
|
||||||
|
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||||
|
MUL2(a2, b0, b1);\
|
||||||
|
a2 = _mm_xor_si128(a2, b2);\
|
||||||
|
MUL2(a3, b0, b1);\
|
||||||
|
a3 = _mm_xor_si128(a3, b3);\
|
||||||
|
MUL2(a4, b0, b1);\
|
||||||
|
a4 = _mm_xor_si128(a4, b4);\
|
||||||
|
MUL2(a5, b0, b1);\
|
||||||
|
a5 = _mm_xor_si128(a5, b5);\
|
||||||
|
MUL2(a6, b0, b1);\
|
||||||
|
a6 = _mm_xor_si128(a6, b6);\
|
||||||
|
MUL2(a7, b0, b1);\
|
||||||
|
a7 = _mm_xor_si128(a7, b7);\
|
||||||
|
\
|
||||||
|
/* compute v_i : double w_i */\
|
||||||
|
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||||
|
MUL2(a0, b0, b1);\
|
||||||
|
b5 = _mm_xor_si128(b5, a0);\
|
||||||
|
MUL2(a1, b0, b1);\
|
||||||
|
b6 = _mm_xor_si128(b6, a1);\
|
||||||
|
MUL2(a2, b0, b1);\
|
||||||
|
b7 = _mm_xor_si128(b7, a2);\
|
||||||
|
MUL2(a5, b0, b1);\
|
||||||
|
b2 = _mm_xor_si128(b2, a5);\
|
||||||
|
MUL2(a6, b0, b1);\
|
||||||
|
b3 = _mm_xor_si128(b3, a6);\
|
||||||
|
MUL2(a7, b0, b1);\
|
||||||
|
b4 = _mm_xor_si128(b4, a7);\
|
||||||
|
MUL2(a3, b0, b1);\
|
||||||
|
MUL2(a4, b0, b1);\
|
||||||
|
b0 = TEMP0;\
|
||||||
|
b1 = TEMP1;\
|
||||||
|
b0 = _mm_xor_si128(b0, a3);\
|
||||||
|
b1 = _mm_xor_si128(b1, a4);\
|
||||||
|
}/*MixBytes*/
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
b6 = a0;\
|
b6 = a0;\
|
||||||
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
|||||||
b1 = _mm_xor_si128(b1, a4);\
|
b1 = _mm_xor_si128(b1, a4);\
|
||||||
}/*MixBytes*/
|
}/*MixBytes*/
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/* one round
|
/* one round
|
||||||
* a0-a7 = input rows
|
* a0-a7 = input rows
|
||||||
|
|||||||
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
|||||||
* xmm[j] will be lost
|
* xmm[j] will be lost
|
||||||
* xmm[k] has to be all 0x1b */
|
* xmm[k] has to be all 0x1b */
|
||||||
#define MUL2(i, j, k){\
|
#define MUL2(i, j, k){\
|
||||||
j = _mm_xor_si128(j, j);\
|
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||||
j = _mm_cmpgt_epi8(j, i);\
|
|
||||||
i = _mm_add_epi8(i, i);\
|
i = _mm_add_epi8(i, i);\
|
||||||
j = _mm_and_si128(j, k);\
|
i = mm128_xorand(i, j, k );\
|
||||||
i = _mm_xor_si128(i, j);\
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Yet another implementation of MixBytes.
|
/* Yet another implementation of MixBytes.
|
||||||
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
|||||||
We almost fit into 16 registers, need only 3 spills to memory.
|
We almost fit into 16 registers, need only 3 spills to memory.
|
||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
|
/* t_i = a_i + a_{i+1} */\
|
||||||
|
b6 = a0;\
|
||||||
|
b7 = a1;\
|
||||||
|
a0 = _mm_xor_si128(a0, a1);\
|
||||||
|
b0 = a2;\
|
||||||
|
a1 = _mm_xor_si128(a1, a2);\
|
||||||
|
b1 = a3;\
|
||||||
|
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||||
|
b2 = a4;\
|
||||||
|
a3 = _mm_xor_si128(a3, a4);\
|
||||||
|
b3 = a5;\
|
||||||
|
a4 = _mm_xor_si128(a4, a5);\
|
||||||
|
b4 = a6;\
|
||||||
|
a5 = _mm_xor_si128(a5, a6);\
|
||||||
|
b5 = a7;\
|
||||||
|
a6 = _mm_xor_si128(a6, a7);\
|
||||||
|
a7 = _mm_xor_si128(a7, b6);\
|
||||||
|
\
|
||||||
|
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||||
|
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||||
|
/* spill values y_4, y_5 to memory */\
|
||||||
|
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||||
|
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||||
|
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||||
|
b0 = a0;\
|
||||||
|
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||||
|
b1 = a1;\
|
||||||
|
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||||
|
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||||
|
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||||
|
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||||
|
\
|
||||||
|
/* compute x_i = t_i + t_{i+3} */\
|
||||||
|
a0 = _mm_xor_si128(a0, a3);\
|
||||||
|
a1 = _mm_xor_si128(a1, a4);\
|
||||||
|
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||||
|
a3 = _mm_xor_si128(a3, a6);\
|
||||||
|
a4 = _mm_xor_si128(a4, a7);\
|
||||||
|
a5 = _mm_xor_si128(a5, b0);\
|
||||||
|
a6 = _mm_xor_si128(a6, b1);\
|
||||||
|
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||||
|
\
|
||||||
|
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||||
|
/* compute w_i : add y_{i+4} */\
|
||||||
|
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||||
|
MUL2(a0, b0, b1);\
|
||||||
|
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||||
|
MUL2(a1, b0, b1);\
|
||||||
|
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||||
|
MUL2(a2, b0, b1);\
|
||||||
|
a2 = _mm_xor_si128(a2, b2);\
|
||||||
|
MUL2(a3, b0, b1);\
|
||||||
|
a3 = _mm_xor_si128(a3, b3);\
|
||||||
|
MUL2(a4, b0, b1);\
|
||||||
|
a4 = _mm_xor_si128(a4, b4);\
|
||||||
|
MUL2(a5, b0, b1);\
|
||||||
|
a5 = _mm_xor_si128(a5, b5);\
|
||||||
|
MUL2(a6, b0, b1);\
|
||||||
|
a6 = _mm_xor_si128(a6, b6);\
|
||||||
|
MUL2(a7, b0, b1);\
|
||||||
|
a7 = _mm_xor_si128(a7, b7);\
|
||||||
|
\
|
||||||
|
/* compute v_i : double w_i */\
|
||||||
|
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||||
|
MUL2(a0, b0, b1);\
|
||||||
|
b5 = _mm_xor_si128(b5, a0);\
|
||||||
|
MUL2(a1, b0, b1);\
|
||||||
|
b6 = _mm_xor_si128(b6, a1);\
|
||||||
|
MUL2(a2, b0, b1);\
|
||||||
|
b7 = _mm_xor_si128(b7, a2);\
|
||||||
|
MUL2(a5, b0, b1);\
|
||||||
|
b2 = _mm_xor_si128(b2, a5);\
|
||||||
|
MUL2(a6, b0, b1);\
|
||||||
|
b3 = _mm_xor_si128(b3, a6);\
|
||||||
|
MUL2(a7, b0, b1);\
|
||||||
|
b4 = _mm_xor_si128(b4, a7);\
|
||||||
|
MUL2(a3, b0, b1);\
|
||||||
|
MUL2(a4, b0, b1);\
|
||||||
|
b0 = TEMP0;\
|
||||||
|
b1 = TEMP1;\
|
||||||
|
b0 = _mm_xor_si128(b0, a3);\
|
||||||
|
b1 = _mm_xor_si128(b1, a4);\
|
||||||
|
}/*MixBytes*/
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
b6 = a0;\
|
b6 = a0;\
|
||||||
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
|||||||
b1 = _mm_xor_si128(b1, a4);\
|
b1 = _mm_xor_si128(b1, a4);\
|
||||||
}/*MixBytes*/
|
}/*MixBytes*/
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/* one round
|
/* one round
|
||||||
* i = round number
|
* i = round number
|
||||||
* a0-a7 = input rows
|
* a0-a7 = input rows
|
||||||
@@ -43,7 +43,8 @@
 #define ROUNDS (ROUNDS1024)
 //#endif

-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+#define ROTL64(a,n) rol64( a, n )

 #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
 #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
 //#define ROUNDS (ROUNDS1024)
 //#endif

-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+#define ROTL64(a,n) rol64( a, n )

 #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
 #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
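
Both hunks above forward ROTL64 to a rol64() helper whose definition is not part of this diff. A plausible portable definition, assumed here rather than taken from the source, is:

    #include <stdint.h>

    /* Assumed shape of rol64(): masking the count keeps the shifts well
       defined for n == 0 and lets the compiler emit a single rotate. */
    static inline uint64_t rol64( uint64_t x, unsigned n )
    {
        return ( x << ( n & 63 ) ) | ( x >> ( (64 - n) & 63 ) );
    }
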
@@ -51,7 +51,7 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
    const int hashlen_m128i = 32 >> 4;   // bytes to __m128i
    const int hash_offset = SIZE256 - hashlen_m128i;
    int rem = ctx->rem_ptr;
-   int blocks = len / SIZE256;
+   uint64_t blocks = len / SIZE256;
    __m512i* in = (__m512i*)input;
    int i;
@@ -89,21 +89,21 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
    if ( i == SIZE256 - 1 )
    {
       // only 1 vector left in buffer, all padding at once
-      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 );
+      ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
    }
    else
    {
       // add first padding
-      ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
+      ctx->buffer[i] = m512_const2_64( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
          ctx->buffer[i] = m512_zero;

       // add length padding, second last byte is zero unless blocks > 255
-      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
+      ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
    }

    // digest final padding block and do output transform
    TF512_4way( ctx->chaining, ctx->buffer );

    OF512_4way( ctx->chaining );
@@ -122,7 +122,7 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
    const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
    const int hash_offset = SIZE256 - hashlen_m128i;
    int rem = ctx->rem_ptr;
-   int blocks = len / SIZE256;
+   uint64_t blocks = len / SIZE256;
    __m512i* in = (__m512i*)input;
    int i;
@@ -146,20 +146,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
    if ( i == SIZE256 - 1 )
    {
       // only 1 vector left in buffer, all padding at once
-      ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
-                  blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+      ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
    }
    else
    {
       // add first padding
-      ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
+      ctx->buffer[i] = m512_const2_64( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
          ctx->buffer[i] = m512_zero;

       // add length padding, second last byte is zero unless blocks > 255
-      ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
-                  blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
+      ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
    }

    // digest final padding block and do output transform
@@ -209,7 +207,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
    const int hashlen_m128i = 32 >> 4;   // bytes to __m128i
    const int hash_offset = SIZE256 - hashlen_m128i;
    int rem = ctx->rem_ptr;
-   int blocks = len / SIZE256;
+   uint64_t blocks = len / SIZE256;
    __m256i* in = (__m256i*)input;
    int i;
@@ -247,7 +245,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
    if ( i == SIZE256 - 1 )
    {
       // only 1 vector left in buffer, all padding at once
-      ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 );
+      ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
    }
    else
    {
@@ -258,10 +256,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
       ctx->buffer[i] = m256_zero;

       // add length padding, second last byte is zero unless blocks > 255
-      ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 );
+      ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
    }

    // digest final padding block and do output transform
    TF512_2way( ctx->chaining, ctx->buffer );

    OF512_2way( ctx->chaining );
@@ -279,7 +277,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
    const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
    const int hash_offset = SIZE256 - hashlen_m128i;
    int rem = ctx->rem_ptr;
-   int blocks = len / SIZE256;
+   uint64_t blocks = len / SIZE256;
    __m256i* in = (__m256i*)input;
    int i;
@@ -303,8 +301,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
    if ( i == SIZE256 - 1 )
    {
       // only 1 vector left in buffer, all padding at once
-      ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
-                  blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+      ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
    }
    else
    {
@@ -315,8 +312,7 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
       ctx->buffer[i] = m256_zero;

       // add length padding, second last byte is zero unless blocks > 255
-      ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
-                  blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
+      ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
    }

    // digest final padding block and do output transform
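
The padding hunks above replace the byte-wise _mm_set_epi8 construction with one 64-bit constant per lane: blocks << 56 places the block count in the top byte of the high quadword, i.e. the last byte of each 16-byte lane in memory. A scalar sketch of the simplest case (an all-padding final block) follows; groestl_pad_tail is an illustrative name, and the (hi, lo) argument order of m512_const2_64 is assumed from the call sites.

    #include <stdint.h>
    #include <string.h>

    /* Build an all-padding final block: the 0x80 byte right after the data,
       zeros, then the block count in the very last byte.  Only the low
       8 bits of 'blocks' are encoded here, which is why the diff comments
       note the second-last byte stays zero unless blocks > 255. */
    static void groestl_pad_tail( uint64_t *block_qwords, int nqwords, uint64_t blocks )
    {
        memset( block_qwords, 0, (size_t)nqwords * sizeof(uint64_t) );
        block_qwords[0] = 0x80;                    /* first padding byte   */
        block_qwords[nqwords - 1] = blocks << 56;  /* length in last byte  */
    }
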
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  * xmm[j] will be lost
  * xmm[k] has to be all 0x1b */
 #define MUL2(i, j, k){\
-   j = _mm512_xor_si512(j, j);\
-   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
+   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
    i = _mm512_add_epi8(i, i);\
-   j = _mm512_and_si512(j, k);\
-   i = _mm512_xor_si512(i, j);\
+   i = mm512_xorand( i, j, k );\
 }
/* Yet another implementation of MixBytes.
|
/* Yet another implementation of MixBytes.
|
||||||
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
|||||||
We almost fit into 16 registers, need only 3 spills to memory.
|
We almost fit into 16 registers, need only 3 spills to memory.
|
||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
|
|
||||||
|
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||||
|
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||||
|
/* t_i = a_i + a_{i+1} */\
|
||||||
|
b6 = a0; \
|
||||||
|
b7 = a1; \
|
||||||
|
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||||
|
b0 = a2; \
|
||||||
|
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||||
|
b1 = a3; \
|
||||||
|
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||||
|
b2 = a4; \
|
||||||
|
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||||
|
b3 = a5; \
|
||||||
|
a4 = _mm512_xor_si512( a4, a5 );\
|
||||||
|
b4 = a6; \
|
||||||
|
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||||
|
b5 = a7; \
|
||||||
|
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||||
|
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||||
|
\
|
||||||
|
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||||
|
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||||
|
/* spill values y_4, y_5 to memory */\
|
||||||
|
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||||
|
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||||
|
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||||
|
b0 = a0; \
|
||||||
|
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||||
|
b1 = a1; \
|
||||||
|
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||||
|
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||||
|
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||||
|
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||||
|
\
|
||||||
|
/* compute x_i = t_i + t_{i+3} */\
|
||||||
|
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||||
|
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||||
|
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||||
|
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||||
|
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||||
|
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||||
|
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||||
|
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||||
|
\
|
||||||
|
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||||
|
/* compute w_i : add y_{i+4} */\
|
||||||
|
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||||
|
MUL2( a0, b0, b1 ); \
|
||||||
|
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||||
|
MUL2( a1, b0, b1 ); \
|
||||||
|
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||||
|
MUL2( a2, b0, b1 ); \
|
||||||
|
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||||
|
MUL2( a3, b0, b1 ); \
|
||||||
|
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||||
|
MUL2( a4, b0, b1 ); \
|
||||||
|
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||||
|
MUL2( a5, b0, b1 ); \
|
||||||
|
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||||
|
MUL2( a6, b0, b1 ); \
|
||||||
|
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||||
|
MUL2( a7, b0, b1 ); \
|
||||||
|
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||||
|
\
|
||||||
|
/* compute v_i : double w_i */\
|
||||||
|
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||||
|
MUL2( a0, b0, b1 ); \
|
||||||
|
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||||
|
MUL2( a1, b0, b1 ); \
|
||||||
|
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||||
|
MUL2( a2, b0, b1 ); \
|
||||||
|
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||||
|
MUL2( a5, b0, b1 ); \
|
||||||
|
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||||
|
MUL2( a6, b0, b1 ); \
|
||||||
|
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||||
|
MUL2( a7, b0, b1 ); \
|
||||||
|
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||||
|
MUL2( a3, b0, b1 ); \
|
||||||
|
MUL2( a4, b0, b1 ); \
|
||||||
|
b0 = TEMP0;\
|
||||||
|
b1 = TEMP1;\
|
||||||
|
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||||
|
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||||
|
}/*MixBytes*/
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
b6 = a0;\
|
b6 = a0;\
|
||||||
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
|||||||
b0 = _mm512_xor_si512(b0, a3);\
|
b0 = _mm512_xor_si512(b0, a3);\
|
||||||
b1 = _mm512_xor_si512(b1, a4);\
|
b1 = _mm512_xor_si512(b1, a4);\
|
||||||
}/*MixBytes*/
|
}/*MixBytes*/
|
||||||
|
#endif
|
||||||
|
|
||||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* AddRoundConstant */\
|
/* AddRoundConstant */\
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
|||||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||||
int rem = ctx->rem_ptr;
|
int rem = ctx->rem_ptr;
|
||||||
int blocks = len / SIZE512;
|
uint64_t blocks = len / SIZE512;
|
||||||
__m512i* in = (__m512i*)input;
|
__m512i* in = (__m512i*)input;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
@@ -64,16 +64,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
|||||||
if ( i == SIZE512 - 1 )
|
if ( i == SIZE512 - 1 )
|
||||||
{
|
{
|
||||||
// only 1 vector left in buffer, all padding at once
|
// only 1 vector left in buffer, all padding at once
|
||||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||||
ctx->buffer[i] = m512_zero;
|
ctx->buffer[i] = m512_zero;
|
||||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||||
@@ -124,7 +122,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||||
ctx->buffer[i] = m512_zero;
|
ctx->buffer[i] = m512_zero;
|
||||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||||
@@ -168,7 +166,7 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
|||||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||||
int rem = ctx->rem_ptr;
|
int rem = ctx->rem_ptr;
|
||||||
int blocks = len / SIZE512;
|
uint64_t blocks = len / SIZE512;
|
||||||
__m256i* in = (__m256i*)input;
|
__m256i* in = (__m256i*)input;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
@@ -189,16 +187,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
|||||||
if ( i == SIZE512 - 1 )
|
if ( i == SIZE512 - 1 )
|
||||||
{
|
{
|
||||||
// only 1 vector left in buffer, all padding at once
|
// only 1 vector left in buffer, all padding at once
|
||||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||||
ctx->buffer[i] = m256_zero;
|
ctx->buffer[i] = m256_zero;
|
||||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||||
|
|||||||
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
* xmm[j] will be lost
|
* xmm[j] will be lost
|
||||||
* xmm[k] has to be all 0x1b */
|
* xmm[k] has to be all 0x1b */
|
||||||
#define MUL2(i, j, k){\
|
#define MUL2(i, j, k){\
|
||||||
j = _mm512_xor_si512(j, j);\
|
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
|
||||||
i = _mm512_add_epi8(i, i);\
|
i = _mm512_add_epi8(i, i);\
|
||||||
j = _mm512_and_si512(j, k);\
|
i = mm512_xorand( i, j, k );\
|
||||||
i = _mm512_xor_si512(i, j);\
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**/
|
/**/
|
||||||
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
We almost fit into 16 registers, need only 3 spills to memory.
|
We almost fit into 16 registers, need only 3 spills to memory.
|
||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||||
|
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
b6 = a0;\
|
b6 = a0; \
|
||||||
b7 = a1;\
|
b7 = a1; \
|
||||||
a0 = _mm512_xor_si512(a0, a1);\
|
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||||
b0 = a2;\
|
b0 = a2; \
|
||||||
a1 = _mm512_xor_si512(a1, a2);\
|
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||||
b1 = a3;\
|
b1 = a3; \
|
||||||
a2 = _mm512_xor_si512(a2, a3);\
|
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||||
b2 = a4;\
|
b2 = a4; \
|
||||||
a3 = _mm512_xor_si512(a3, a4);\
|
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||||
b3 = a5;\
|
b3 = a5; \
|
||||||
a4 = _mm512_xor_si512(a4, a5);\
|
a4 = _mm512_xor_si512( a4, a5 );\
|
||||||
b4 = a6;\
|
b4 = a6; \
|
||||||
a5 = _mm512_xor_si512(a5, a6);\
|
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||||
b5 = a7;\
|
b5 = a7; \
|
||||||
a6 = _mm512_xor_si512(a6, a7);\
|
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||||
a7 = _mm512_xor_si512(a7, b6);\
|
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||||
\
|
\
|
||||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||||
b0 = _mm512_xor_si512(b0, a4);\
|
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||||
b6 = _mm512_xor_si512(b6, a4);\
|
|
||||||
b1 = _mm512_xor_si512(b1, a5);\
|
|
||||||
b7 = _mm512_xor_si512(b7, a5);\
|
|
||||||
b2 = _mm512_xor_si512(b2, a6);\
|
|
||||||
b0 = _mm512_xor_si512(b0, a6);\
|
|
||||||
/* spill values y_4, y_5 to memory */\
|
/* spill values y_4, y_5 to memory */\
|
||||||
TEMP0 = b0;\
|
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||||
b3 = _mm512_xor_si512(b3, a7);\
|
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||||
b1 = _mm512_xor_si512(b1, a7);\
|
|
||||||
TEMP1 = b1;\
|
|
||||||
b4 = _mm512_xor_si512(b4, a0);\
|
|
||||||
b2 = _mm512_xor_si512(b2, a0);\
|
|
||||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||||
b0 = a0;\
|
b0 = a0; \
|
||||||
b5 = _mm512_xor_si512(b5, a1);\
|
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||||
b3 = _mm512_xor_si512(b3, a1);\
|
b1 = a1; \
|
||||||
b1 = a1;\
|
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||||
b6 = _mm512_xor_si512(b6, a2);\
|
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||||
b4 = _mm512_xor_si512(b4, a2);\
|
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||||
TEMP2 = a2;\
|
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||||
b7 = _mm512_xor_si512(b7, a3);\
|
|
||||||
b5 = _mm512_xor_si512(b5, a3);\
|
|
||||||
\
|
\
|
||||||
/* compute x_i = t_i + t_{i+3} */\
|
/* compute x_i = t_i + t_{i+3} */\
|
||||||
a0 = _mm512_xor_si512(a0, a3);\
|
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||||
a1 = _mm512_xor_si512(a1, a4);\
|
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||||
a2 = _mm512_xor_si512(a2, a5);\
|
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||||
a3 = _mm512_xor_si512(a3, a6);\
|
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||||
a4 = _mm512_xor_si512(a4, a7);\
|
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||||
a5 = _mm512_xor_si512(a5, b0);\
|
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||||
a6 = _mm512_xor_si512(a6, b1);\
|
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||||
\
|
\
|
||||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||||
/* compute w_i : add y_{i+4} */\
|
/* compute w_i : add y_{i+4} */\
|
||||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||||
MUL2(a0, b0, b1);\
|
MUL2( a0, b0, b1 ); \
|
||||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||||
MUL2(a1, b0, b1);\
|
MUL2( a1, b0, b1 ); \
|
||||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||||
MUL2(a2, b0, b1);\
|
MUL2( a2, b0, b1 ); \
|
||||||
a2 = _mm512_xor_si512(a2, b2);\
|
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||||
MUL2(a3, b0, b1);\
|
MUL2( a3, b0, b1 ); \
|
||||||
a3 = _mm512_xor_si512(a3, b3);\
|
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||||
MUL2(a4, b0, b1);\
|
MUL2( a4, b0, b1 ); \
|
||||||
a4 = _mm512_xor_si512(a4, b4);\
|
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||||
MUL2(a5, b0, b1);\
|
MUL2( a5, b0, b1 ); \
|
||||||
a5 = _mm512_xor_si512(a5, b5);\
|
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||||
MUL2(a6, b0, b1);\
|
MUL2( a6, b0, b1 ); \
|
||||||
a6 = _mm512_xor_si512(a6, b6);\
|
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||||
MUL2(a7, b0, b1);\
|
MUL2( a7, b0, b1 ); \
|
||||||
a7 = _mm512_xor_si512(a7, b7);\
|
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||||
\
|
\
|
||||||
/* compute v_i : double w_i */\
|
/* compute v_i : double w_i */\
|
||||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||||
MUL2(a0, b0, b1);\
|
MUL2( a0, b0, b1 ); \
|
||||||
b5 = _mm512_xor_si512(b5, a0);\
|
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||||
MUL2(a1, b0, b1);\
|
MUL2( a1, b0, b1 ); \
|
||||||
b6 = _mm512_xor_si512(b6, a1);\
|
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||||
MUL2(a2, b0, b1);\
|
MUL2( a2, b0, b1 ); \
|
||||||
b7 = _mm512_xor_si512(b7, a2);\
|
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||||
MUL2(a5, b0, b1);\
|
MUL2( a5, b0, b1 ); \
|
||||||
b2 = _mm512_xor_si512(b2, a5);\
|
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||||
MUL2(a6, b0, b1);\
|
MUL2( a6, b0, b1 ); \
|
||||||
b3 = _mm512_xor_si512(b3, a6);\
|
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||||
MUL2(a7, b0, b1);\
|
MUL2( a7, b0, b1 ); \
|
||||||
b4 = _mm512_xor_si512(b4, a7);\
|
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||||
MUL2(a3, b0, b1);\
|
MUL2( a3, b0, b1 ); \
|
||||||
MUL2(a4, b0, b1);\
|
MUL2( a4, b0, b1 ); \
|
||||||
b0 = TEMP0;\
|
b0 = TEMP0;\
|
||||||
b1 = TEMP1;\
|
b1 = TEMP1;\
|
||||||
b0 = _mm512_xor_si512(b0, a3);\
|
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||||
b1 = _mm512_xor_si512(b1, a4);\
|
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||||
}/*MixBytes*/
|
}/*MixBytes*/
|
||||||
|
|
||||||
/* one round
|
/* one round
|
||||||
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
|
|||||||
* xmm[j] will be lost
|
* xmm[j] will be lost
|
||||||
* xmm[k] has to be all 0x1b */
|
* xmm[k] has to be all 0x1b */
|
||||||
#define MUL2_2WAY(i, j, k){\
|
#define MUL2_2WAY(i, j, k){\
|
||||||
j = _mm256_xor_si256(j, j);\
|
j = _mm256_cmpgt_epi8( m256_zero, i );\
|
||||||
j = _mm256_cmpgt_epi8(j, i );\
|
|
||||||
i = _mm256_add_epi8(i, i);\
|
i = _mm256_add_epi8(i, i);\
|
||||||
j = _mm256_and_si256(j, k);\
|
i = mm256_xorand( i, j, k );\
|
||||||
i = _mm256_xor_si256(i, j);\
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
@@ -11,7 +11,7 @@
 #else
 #include "sph_groestl.h"
 #endif
-#include "algo/sha/sph_sha2.h"
+#include "algo/sha/sha256-hash.h"

 typedef struct {
 #ifdef __AES__
@@ -19,7 +19,6 @@ typedef struct {
 #else
    sph_groestl512_context groestl;
 #endif
-   sph_sha256_context sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
 #else
    sph_groestl512_init( &myrgr_ctx.groestl );
 #endif
-   sph_sha256_init( &myrgr_ctx.sha );
 }

 void myriad_hash(void *output, const void *input)
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
    sph_groestl512_close(&ctx.groestl, hash);
 #endif

-   sph_sha256( &ctx.sha, hash, 64 );
-   sph_sha256_close( &ctx.sha, hash );
+   sha256_full( hash, hash, 64 );

    memcpy(output, hash, 32);
 }
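
The myriad-groestl change above drops the per-context sph_sha256 state and finishes with a single one-shot call. The prototype below is an assumption inferred from the call site sha256_full( hash, hash, 64 ), not copied from algo/sha/sha256-hash.h:

    #include <stddef.h>

    /* Assumed signature: write the SHA-256 digest of data[0..len) to hash.
       In the call above it hashes the 64-byte Groestl output in place. */
    void sha256_full( void *hash, const void *data, size_t len );
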
||||||
|
|||||||
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
|
|||||||
|
|
||||||
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
||||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
||||||
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
|
||||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
||||||
|
|
||||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||||
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
|
|||||||
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
||||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
|
||||||
hash6, hash7 );
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
|
|||||||
hash4, hash5, hash6, hash7, input, 640 );
|
hash4, hash5, hash6, hash7, input, 640 );
|
||||||
|
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
||||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
|
||||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
|
||||||
hash4, hash5, hash6, hash7, 512 );
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||||
|
hash6, hash7 );
|
||||||
|
|
||||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
sha256_8way_update( &ctx.sha, vhash, 64 );
|
||||||
sha256_8way_close( &ctx.sha, output );
|
sha256_8way_close( &ctx.sha, output );
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -548,7 +548,7 @@ static const sph_u32 T512[64][16] = {
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
// Hamsi 8 way
|
// Hamsi 8 way AVX512
|
||||||
|
|
||||||
#define INPUT_BIG8 \
|
#define INPUT_BIG8 \
|
||||||
do { \
|
do { \
|
||||||
@@ -560,22 +560,14 @@ do { \
|
|||||||
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
|
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
|
||||||
dm = mm512_negate_32( _mm512_or_si512( dm, \
|
dm = mm512_negate_32( _mm512_or_si512( dm, \
|
||||||
_mm512_slli_epi64( dm, 32 ) ) ); \
|
_mm512_slli_epi64( dm, 32 ) ) ); \
|
||||||
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
|
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
|
||||||
m512_const1_64( tp[0] ) ) ); \
|
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
|
||||||
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
|
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
|
||||||
m512_const1_64( tp[1] ) ) ); \
|
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
|
||||||
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
|
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
|
||||||
m512_const1_64( tp[2] ) ) ); \
|
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
|
||||||
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
|
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
|
||||||
m512_const1_64( tp[3] ) ) ); \
|
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
|
||||||
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
|
|
||||||
m512_const1_64( tp[4] ) ) ); \
|
|
||||||
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
|
|
||||||
m512_const1_64( tp[5] ) ) ); \
|
|
||||||
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
|
|
||||||
m512_const1_64( tp[6] ) ) ); \
|
|
||||||
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
|
|
||||||
m512_const1_64( tp[7] ) ) ); \
|
|
||||||
tp += 8; \
|
tp += 8; \
|
||||||
db = _mm512_srli_epi64( db, 1 ); \
|
db = _mm512_srli_epi64( db, 1 ); \
|
||||||
} \
|
} \
|
||||||
@@ -585,20 +577,13 @@ do { \
|
|||||||
do { \
|
do { \
|
||||||
__m512i t; \
|
__m512i t; \
|
||||||
t = a; \
|
t = a; \
|
||||||
a = _mm512_and_si512( a, c ); \
|
a = mm512_xorand( d, a, c ); \
|
||||||
a = _mm512_xor_si512( a, d ); \
|
c = mm512_xor3( a, b, c ); \
|
||||||
c = _mm512_xor_si512( c, b ); \
|
b = mm512_xoror( b, d, t ); \
|
||||||
c = _mm512_xor_si512( c, a ); \
|
|
||||||
d = _mm512_or_si512( d, t ); \
|
|
||||||
d = _mm512_xor_si512( d, b ); \
|
|
||||||
t = _mm512_xor_si512( t, c ); \
|
t = _mm512_xor_si512( t, c ); \
|
||||||
b = d; \
|
d = mm512_xoror( a, b, t ); \
|
||||||
d = _mm512_or_si512( d, t ); \
|
t = mm512_xorand( t, a, b ); \
|
||||||
d = _mm512_xor_si512( d, a ); \
|
b = mm512_xor3( b, d, t ); \
|
||||||
a = _mm512_and_si512( a, b ); \
|
|
||||||
t = _mm512_xor_si512( t, a ); \
|
|
||||||
b = _mm512_xor_si512( b, d ); \
|
|
||||||
b = _mm512_xor_si512( b, t ); \
|
|
||||||
a = c; \
|
a = c; \
|
||||||
c = b; \
|
c = b; \
|
||||||
b = d; \
|
b = d; \
|
||||||
@@ -609,14 +594,12 @@ do { \
|
|||||||
do { \
|
do { \
|
||||||
a = mm512_rol_32( a, 13 ); \
|
a = mm512_rol_32( a, 13 ); \
|
||||||
c = mm512_rol_32( c, 3 ); \
|
c = mm512_rol_32( c, 3 ); \
|
||||||
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
|
b = mm512_xor3( a, b, c ); \
|
||||||
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
|
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
|
||||||
_mm512_slli_epi32( a, 3 ) ) ); \
|
|
||||||
b = mm512_rol_32( b, 1 ); \
|
b = mm512_rol_32( b, 1 ); \
|
||||||
d = mm512_rol_32( d, 7 ); \
|
d = mm512_rol_32( d, 7 ); \
|
||||||
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
|
a = mm512_xor3( a, b, d ); \
|
||||||
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
|
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
|
||||||
_mm512_slli_epi32( b, 7 ) ) ); \
|
|
||||||
a = mm512_rol_32( a, 5 ); \
|
a = mm512_rol_32( a, 5 ); \
|
||||||
c = mm512_rol_32( c, 22 ); \
|
c = mm512_rol_32( c, 22 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
@@ -649,26 +632,25 @@ do { \
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
#define ROUND_BIG8(rc, alpha) \
|
#define ROUND_BIG8( alpha ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t0, t1, t2, t3; \
|
__m512i t0, t1, t2, t3; \
|
||||||
s0 = _mm512_xor_si512( s0, m512_const1_64( \
|
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
|
||||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
|
||||||
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
|
||||||
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
|
||||||
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
|
||||||
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
|
||||||
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
|
||||||
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
|
||||||
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
|
||||||
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
|
||||||
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
sA = _mm512_xor_si512( sA, alpha[10] ); \
|
||||||
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
sB = _mm512_xor_si512( sB, alpha[11] ); \
|
||||||
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
sC = _mm512_xor_si512( sC, alpha[12] ); \
|
||||||
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
sD = _mm512_xor_si512( sD, alpha[13] ); \
|
||||||
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
sE = _mm512_xor_si512( sE, alpha[14] ); \
|
||||||
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
sF = _mm512_xor_si512( sF, alpha[15] ); \
|
||||||
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
|
||||||
\
|
\
|
||||||
SBOX8( s0, s4, s8, sC ); \
|
SBOX8( s0, s4, s8, sC ); \
|
||||||
SBOX8( s1, s5, s9, sD ); \
|
SBOX8( s1, s5, s9, sD ); \
|
||||||
@@ -748,28 +730,66 @@ do { \
|
|||||||
|
|
||||||
#define P_BIG8 \
|
#define P_BIG8 \
|
||||||
do { \
|
do { \
|
||||||
ROUND_BIG8(0, alpha_n); \
|
__m512i alpha[16]; \
|
||||||
ROUND_BIG8(1, alpha_n); \
|
for( int i = 0; i < 16; i++ ) \
|
||||||
ROUND_BIG8(2, alpha_n); \
|
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||||
ROUND_BIG8(3, alpha_n); \
|
ROUND_BIG8( alpha ); \
|
||||||
ROUND_BIG8(4, alpha_n); \
|
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||||
ROUND_BIG8(5, alpha_n); \
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define PF_BIG8 \
|
#define PF_BIG8 \
|
||||||
do { \
|
do { \
|
||||||
ROUND_BIG8( 0, alpha_f); \
|
__m512i alpha[16]; \
|
||||||
ROUND_BIG8( 1, alpha_f); \
|
for( int i = 0; i < 16; i++ ) \
|
||||||
ROUND_BIG8( 2, alpha_f); \
|
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||||
ROUND_BIG8( 3, alpha_f); \
|
ROUND_BIG8( alpha ); \
|
||||||
ROUND_BIG8( 4, alpha_f); \
|
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||||
ROUND_BIG8( 5, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
ROUND_BIG8( 6, alpha_f); \
|
ROUND_BIG8( alpha ); \
|
||||||
ROUND_BIG8( 7, alpha_f); \
|
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||||
ROUND_BIG8( 8, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
ROUND_BIG8( 9, alpha_f); \
|
ROUND_BIG8( alpha ); \
|
||||||
ROUND_BIG8(10, alpha_f); \
|
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||||
ROUND_BIG8(11, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
|
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG8( alpha ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define T_BIG8 \
|
#define T_BIG8 \
|
||||||
@@ -849,13 +869,11 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
|||||||
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
__m512i pad[1];
|
__m512i pad[1];
|
||||||
int ch, cl;
|
uint32_t ch, cl;
|
||||||
|
|
||||||
sph_enc32be( &ch, sc->count_high );
|
sph_enc32be( &ch, sc->count_high );
|
||||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||||
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
|
pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||||
cl, ch, cl, ch, cl, ch, cl, ch );
|
|
||||||
// pad[0] = m512_const2_32( cl, ch );
|
|
||||||
sc->buf[0] = m512_const1_64( 0x80 );
|
sc->buf[0] = m512_const1_64( 0x80 );
|
||||||
hamsi_8way_big( sc, sc->buf, 1 );
|
hamsi_8way_big( sc, sc->buf, 1 );
|
||||||
hamsi_8way_big_final( sc, pad );
|
hamsi_8way_big_final( sc, pad );
|
||||||
@@ -863,11 +881,9 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
|||||||
mm512_block_bswap_32( (__m512i*)dst, sc->h );
|
mm512_block_bswap_32( (__m512i*)dst, sc->h );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
|
|
||||||
|
// Hamsi 4 way AVX2
|
||||||
// Hamsi 4 way
|
|
||||||
|
|
||||||
#define INPUT_BIG \
|
#define INPUT_BIG \
|
||||||
do { \
|
do { \
|
||||||
@@ -986,26 +1002,25 @@ do { \
|
|||||||
#define sF m7
|
#define sF m7
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define ROUND_BIG(rc, alpha) \
|
#define ROUND_BIG( alpha ) \
|
||||||
do { \
|
do { \
|
||||||
__m256i t0, t1, t2, t3; \
|
__m256i t0, t1, t2, t3; \
|
||||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
|
||||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
|
||||||
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
|
||||||
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
|
||||||
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
|
||||||
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
|
||||||
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
|
||||||
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
|
||||||
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
|
||||||
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
|
||||||
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
sA = _mm256_xor_si256( sA, alpha[10] ); \
|
||||||
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
sB = _mm256_xor_si256( sB, alpha[11] ); \
|
||||||
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
sC = _mm256_xor_si256( sC, alpha[12] ); \
|
||||||
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
sD = _mm256_xor_si256( sD, alpha[13] ); \
|
||||||
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
sE = _mm256_xor_si256( sE, alpha[14] ); \
|
||||||
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
sF = _mm256_xor_si256( sF, alpha[15] ); \
|
||||||
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
|
||||||
\
|
\
|
||||||
SBOX( s0, s4, s8, sC ); \
|
SBOX( s0, s4, s8, sC ); \
|
||||||
SBOX( s1, s5, s9, sD ); \
|
SBOX( s1, s5, s9, sD ); \
|
||||||
@@ -1085,28 +1100,66 @@ do { \
|
|||||||
|
|
||||||
#define P_BIG \
|
#define P_BIG \
|
||||||
do { \
|
do { \
|
||||||
ROUND_BIG(0, alpha_n); \
|
__m256i alpha[16]; \
|
||||||
ROUND_BIG(1, alpha_n); \
|
for( int i = 0; i < 16; i++ ) \
|
||||||
ROUND_BIG(2, alpha_n); \
|
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||||
ROUND_BIG(3, alpha_n); \
|
ROUND_BIG( alpha ); \
|
||||||
ROUND_BIG(4, alpha_n); \
|
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||||
ROUND_BIG(5, alpha_n); \
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define PF_BIG \
|
#define PF_BIG \
|
||||||
do { \
|
do { \
|
||||||
ROUND_BIG( 0, alpha_f); \
|
__m256i alpha[16]; \
|
||||||
ROUND_BIG( 1, alpha_f); \
|
for( int i = 0; i < 16; i++ ) \
|
||||||
ROUND_BIG( 2, alpha_f); \
|
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||||
ROUND_BIG( 3, alpha_f); \
|
ROUND_BIG( alpha ); \
|
||||||
ROUND_BIG( 4, alpha_f); \
|
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||||
ROUND_BIG( 5, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
ROUND_BIG( 6, alpha_f); \
|
ROUND_BIG( alpha ); \
|
||||||
ROUND_BIG( 7, alpha_f); \
|
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||||
ROUND_BIG( 8, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
ROUND_BIG( 9, alpha_f); \
|
ROUND_BIG( alpha ); \
|
||||||
ROUND_BIG(10, alpha_f); \
|
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||||
ROUND_BIG(11, alpha_f); \
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
|
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
|
||||||
|
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||||
|
ROUND_BIG( alpha ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define T_BIG \
|
#define T_BIG \
|
||||||
@@ -1186,14 +1239,12 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
|||||||
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
__m256i pad[1];
|
__m256i pad[1];
|
||||||
int ch, cl;
|
uint32_t ch, cl;
|
||||||
|
|
||||||
sph_enc32be( &ch, sc->count_high );
|
sph_enc32be( &ch, sc->count_high );
|
||||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||||
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
|
pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||||
sc->buf[0] = m256_const1_64( 0x80 );
|
sc->buf[0] = m256_const1_64( 0x80 );
|
||||||
// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
|
||||||
// 0UL, 0x80UL, 0UL, 0x80UL );
|
|
||||||
hamsi_big( sc, sc->buf, 1 );
|
hamsi_big( sc, sc->buf, 1 );
|
||||||
hamsi_big_final( sc, pad );
|
hamsi_big_final( sc, pad );
|
||||||
|
|
||||||
|
|||||||
@@ -522,49 +522,52 @@ do { \
|
|||||||
|
|
||||||
// Haval-256 8 way 32 bit avx2
|
// Haval-256 8 way 32 bit avx2
|
||||||
|
|
||||||
|
#if defined (__AVX512VL__)
|
||||||
|
|
||||||
|
// ( ~( a ^ b ) ) & c
|
||||||
|
#define mm256_andnotxor( a, b, c ) \
|
||||||
|
_mm256_ternarylogic_epi32( a, b, c, 0x82 )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define mm256_andnotxor( a, b, c ) \
|
||||||
|
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
_mm256_xor_si256( x0, \
|
mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
|
||||||
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
|
|
||||||
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
||||||
_mm256_and_si256( x3, x6 ) ) ) ) \
|
_mm256_and_si256( x3, x6 ) ) ) \
|
||||||
|
|
||||||
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
_mm256_xor_si256( \
|
mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
|
||||||
_mm256_and_si256( x2, \
|
mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
|
||||||
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
|
mm256_andxor( x4, x1, x5 ), \
|
||||||
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
|
mm256_xorand( x0, x3, x5 ) ) \
|
||||||
_mm256_xor_si256( x6, x0 ) ) ) ), \
|
|
||||||
_mm256_xor_si256( \
|
|
||||||
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
|
|
||||||
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
|
|
||||||
|
|
||||||
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
_mm256_xor_si256( \
|
mm256_xor3( x0, \
|
||||||
_mm256_and_si256( x3, \
|
_mm256_and_si256( x3, \
|
||||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
|
||||||
_mm256_xor_si256( x6, x0 ) ) ), \
|
_mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||||
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
|
_mm256_and_si256( x2, x5 ) ) )
|
||||||
_mm256_and_si256( x2, x5 ) ), x0 ) )
|
|
||||||
|
|
||||||
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
_mm256_xor_si256( \
|
mm256_xor3( \
|
||||||
_mm256_xor_si256( \
|
mm256_andxor( x3, x5, \
|
||||||
_mm256_and_si256( x3, \
|
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
_mm256_or_si256( x4, x6 ) ) ), \
|
||||||
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
|
|
||||||
_mm256_and_si256( x4, \
|
_mm256_and_si256( x4, \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
|
mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
|
||||||
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
|
_mm256_xor_si256( x1, x6 ) ) ), \
|
||||||
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
|
mm256_xorand( x0, x2, x6 ) )
|
||||||
|
|
||||||
|
|
||||||
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
_mm256_xor_si256( \
|
_mm256_xor_si256( \
|
||||||
_mm256_and_si256( x0, \
|
mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
|
||||||
mm256_not( _mm256_xor_si256( \
|
mm256_xor3( _mm256_and_si256( x1, x4 ), \
|
||||||
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
|
_mm256_and_si256( x2, x5 ), \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
|
||||||
_mm256_and_si256( x2, x5 ) ), \
|
|
||||||
_mm256_and_si256( x3, x6 ) ) )
|
_mm256_and_si256( x3, x6 ) ) )
|
||||||
|
|
||||||
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
 #include "hodl-gate.h"
 #include "hodl-wolf.h"
 #include "miner.h"
+#include "algo/sha/sha256d.h"

 #if defined(__AES__)

@@ -51,15 +51,15 @@ extern "C"{
 do { \
    __m512i cc = _mm512_set1_epi64( c ); \
    x3 = mm512_not( x3 ); \
-   x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
+   x0 = mm512_xorandnot( x0, x2, cc ); \
-   tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
+   tmp = mm512_xorand( cc, x0, x1 ); \
-   x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
+   x0 = mm512_xorand( x0, x2, x3 ); \
-   x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
+   x3 = mm512_xorandnot( x3, x1, x2 ); \
-   x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
+   x1 = mm512_xorand( x1, x0, x2 ); \
-   x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
+   x2 = mm512_xorandnot( x2, x3, x0 ); \
-   x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
+   x0 = mm512_xoror( x0, x1, x3 ); \
-   x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
+   x3 = mm512_xorand( x3, x1, x2 ); \
-   x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
+   x1 = mm512_xorand( x1, tmp, x0 ); \
    x2 = _mm512_xor_si512( x2, tmp ); \
 } while (0)

@@ -67,11 +67,11 @@ do { \
 do { \
    x4 = _mm512_xor_si512( x4, x1 ); \
    x5 = _mm512_xor_si512( x5, x2 ); \
-   x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
+   x6 = mm512_xor3( x6, x3, x0 ); \
    x7 = _mm512_xor_si512( x7, x0 ); \
    x0 = _mm512_xor_si512( x0, x5 ); \
    x1 = _mm512_xor_si512( x1, x6 ); \
-   x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
+   x2 = mm512_xor3( x2, x7, x4 ); \
    x3 = _mm512_xor_si512( x3, x4 ); \
 } while (0)

@@ -318,12 +318,12 @@ static const sph_u64 C[] = {
 #define Wz_8W(x, c, n) \
 do { \
    __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
-   x ## h = _mm512_or_si512( _mm512_and_si512( \
-                             _mm512_srli_epi64(x ## h, (n)), (c)), t ); \
+   x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
    t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
-   x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
+   x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
 } while (0)


 #define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
 #define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
 #define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
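Note on Wz_8W: it swaps adjacent groups of n bits inside each 64-bit word (n = 1, 2, 4 for W80/W81/W82), and the rewrite only fuses the final AND and OR into the project's mm512_orand wrapper, which presumably computes t | (v & c) in one ternary-logic instruction. A scalar sketch of the same permutation on one uint64_t; the helper name is illustrative, not part of the source:

    #include <stdint.h>

    // Swap adjacent n-bit groups of x; c selects the low group of each pair,
    // e.g. c = 0x5555555555555555, n = 1 swaps neighbouring bits (W80).
    static inline uint64_t wz_swap( uint64_t x, uint64_t c, unsigned n )
    {
        uint64_t t = ( x & c ) << n;      // low groups moved up
        return ( ( x >> n ) & c ) | t;    // high groups moved down, recombined
    }
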
@@ -1,5 +1,6 @@
 #include "keccak-gate.h"
 #include "sph_keccak.h"
+#include "algo/sha/sha256d.h"

 int hard_coded_eb = 1;

@@ -76,6 +76,9 @@ static const uint64_t RC[] = {
 #define OR64(d, a, b)    (d = _mm512_or_si512(a,b))
 #define NOT64(d, s)      (d = _mm512_xor_si512(s,m512_neg1))
 #define ROL64(d, v, n)   (d = mm512_rol_64(v, n))
+#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
+#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))


 #include "keccak-macros.c"

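Note on XOROR / XORAND: d = a ^ (b | c) and d = a ^ (b & c) collapse to a single instruction on AVX-512, and the mm512_xoror / mm512_xorand wrappers presumably reduce to _mm512_ternarylogic_epi64, while the AVX2 path below keeps the two-instruction form. A hedged sketch of the direct ternary-logic equivalents, using Intel's truth-table convention (result bit = imm[(a<<2) | (b<<1) | c]); the helper names are mine:

    #include <immintrin.h>

    #if defined(__AVX512F__)
    // d = a ^ (b | c): truth table 0x1e.   d = a ^ (b & c): truth table 0x78.
    static inline __m512i xoror_512 ( __m512i a, __m512i b, __m512i c )
    {  return _mm512_ternarylogic_epi64( a, b, c, 0x1e );  }

    static inline __m512i xorand_512( __m512i a, __m512i b, __m512i c )
    {  return _mm512_ternarylogic_epi64( a, b, c, 0x78 );  }
    #endif
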
@@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst)
 #undef NOT64
 #undef ROL64
 #undef KECCAK_F_1600
+#undef XOROR
+#undef XORAND

 #endif   // AVX512

@@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst)
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
 #define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rol_64(v, n))
+#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
+#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))

 #include "keccak-macros.c"

@@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
 #undef NOT64
 #undef ROL64
 #undef KECCAK_F_1600
+#undef XOROR
+#undef XORAND

 #endif   // AVX2
@@ -110,20 +110,34 @@
 #ifdef KHI_XO
 #undef KHI_XO
 #endif

+#define KHI_XO(d, a, b, c) do { \
+   XOROR(d, a, b, c); \
+} while (0)
+
+/*
 #define KHI_XO(d, a, b, c) do { \
    DECL64(kt); \
    OR64(kt, b, c); \
    XOR64(d, a, kt); \
 } while (0)
+*/

 #ifdef KHI_XA
 #undef KHI_XA
 #endif

+#define KHI_XA(d, a, b, c) do { \
+   XORAND(d, a, b, c); \
+} while (0)
+
+/*
 #define KHI_XA(d, a, b, c) do { \
    DECL64(kt); \
    AND64(kt, b, c); \
    XOR64(d, a, kt); \
 } while (0)
+*/

 #ifdef KHI
 #undef KHI
@@ -134,65 +148,47 @@
|
|||||||
do { \
|
do { \
|
||||||
DECL64(c0); \
|
DECL64(c0); \
|
||||||
DECL64(c1); \
|
DECL64(c1); \
|
||||||
DECL64(c2); \
|
|
||||||
DECL64(c3); \
|
|
||||||
DECL64(c4); \
|
|
||||||
DECL64(bnn); \
|
DECL64(bnn); \
|
||||||
NOT64(bnn, b20); \
|
NOT64(bnn, b20); \
|
||||||
KHI_XO(c0, b00, b10, b20); \
|
KHI_XO(c0, b00, b10, b20); \
|
||||||
KHI_XO(c1, b10, bnn, b30); \
|
KHI_XO(c1, b10, bnn, b30); \
|
||||||
KHI_XA(c2, b20, b30, b40); \
|
KHI_XA(b20, b20, b30, b40); \
|
||||||
KHI_XO(c3, b30, b40, b00); \
|
KHI_XO(b30, b30, b40, b00); \
|
||||||
KHI_XA(c4, b40, b00, b10); \
|
KHI_XA(b40, b40, b00, b10); \
|
||||||
MOV64(b00, c0); \
|
MOV64(b00, c0); \
|
||||||
MOV64(b10, c1); \
|
MOV64(b10, c1); \
|
||||||
MOV64(b20, c2); \
|
|
||||||
MOV64(b30, c3); \
|
|
||||||
MOV64(b40, c4); \
|
|
||||||
NOT64(bnn, b41); \
|
NOT64(bnn, b41); \
|
||||||
KHI_XO(c0, b01, b11, b21); \
|
KHI_XO(c0, b01, b11, b21); \
|
||||||
KHI_XA(c1, b11, b21, b31); \
|
KHI_XA(c1, b11, b21, b31); \
|
||||||
KHI_XO(c2, b21, b31, bnn); \
|
KHI_XO(b21, b21, b31, bnn); \
|
||||||
KHI_XO(c3, b31, b41, b01); \
|
KHI_XO(b31, b31, b41, b01); \
|
||||||
KHI_XA(c4, b41, b01, b11); \
|
KHI_XA(b41, b41, b01, b11); \
|
||||||
MOV64(b01, c0); \
|
MOV64(b01, c0); \
|
||||||
MOV64(b11, c1); \
|
MOV64(b11, c1); \
|
||||||
MOV64(b21, c2); \
|
|
||||||
MOV64(b31, c3); \
|
|
||||||
MOV64(b41, c4); \
|
|
||||||
NOT64(bnn, b32); \
|
NOT64(bnn, b32); \
|
||||||
KHI_XO(c0, b02, b12, b22); \
|
KHI_XO(c0, b02, b12, b22); \
|
||||||
KHI_XA(c1, b12, b22, b32); \
|
KHI_XA(c1, b12, b22, b32); \
|
||||||
KHI_XA(c2, b22, bnn, b42); \
|
KHI_XA(b22, b22, bnn, b42); \
|
||||||
KHI_XO(c3, bnn, b42, b02); \
|
KHI_XO(b32, bnn, b42, b02); \
|
||||||
KHI_XA(c4, b42, b02, b12); \
|
KHI_XA(b42, b42, b02, b12); \
|
||||||
MOV64(b02, c0); \
|
MOV64(b02, c0); \
|
||||||
MOV64(b12, c1); \
|
MOV64(b12, c1); \
|
||||||
MOV64(b22, c2); \
|
|
||||||
MOV64(b32, c3); \
|
|
||||||
MOV64(b42, c4); \
|
|
||||||
NOT64(bnn, b33); \
|
NOT64(bnn, b33); \
|
||||||
KHI_XA(c0, b03, b13, b23); \
|
KHI_XA(c0, b03, b13, b23); \
|
||||||
KHI_XO(c1, b13, b23, b33); \
|
KHI_XO(c1, b13, b23, b33); \
|
||||||
KHI_XO(c2, b23, bnn, b43); \
|
KHI_XO(b23, b23, bnn, b43); \
|
||||||
KHI_XA(c3, bnn, b43, b03); \
|
KHI_XA(b33, bnn, b43, b03); \
|
||||||
KHI_XO(c4, b43, b03, b13); \
|
KHI_XO(b43, b43, b03, b13); \
|
||||||
MOV64(b03, c0); \
|
MOV64(b03, c0); \
|
||||||
MOV64(b13, c1); \
|
MOV64(b13, c1); \
|
||||||
MOV64(b23, c2); \
|
|
||||||
MOV64(b33, c3); \
|
|
||||||
MOV64(b43, c4); \
|
|
||||||
NOT64(bnn, b14); \
|
NOT64(bnn, b14); \
|
||||||
KHI_XA(c0, b04, bnn, b24); \
|
KHI_XA(c0, b04, bnn, b24); \
|
||||||
KHI_XO(c1, bnn, b24, b34); \
|
KHI_XO(c1, bnn, b24, b34); \
|
||||||
KHI_XA(c2, b24, b34, b44); \
|
KHI_XA(b24, b24, b34, b44); \
|
||||||
KHI_XO(c3, b34, b44, b04); \
|
KHI_XO(b34, b34, b44, b04); \
|
||||||
KHI_XA(c4, b44, b04, b14); \
|
KHI_XA(b44, b44, b04, b14); \
|
||||||
MOV64(b04, c0); \
|
MOV64(b04, c0); \
|
||||||
MOV64(b14, c1); \
|
MOV64(b14, c1); \
|
||||||
MOV64(b24, c2); \
|
|
||||||
MOV64(b34, c3); \
|
|
||||||
MOV64(b44, c4); \
|
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#ifdef IOTA
|
#ifdef IOTA
|
||||||
@@ -201,6 +197,7 @@
 #define IOTA(r)   XOR64_IOTA(a00, a00, r)

 #ifdef P0
+#undef P0
 #undef P1
 #undef P2
 #undef P3
@@ -66,6 +66,17 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
|||||||
a = _mm512_xor_si512(a,c0);\
|
a = _mm512_xor_si512(a,c0);\
|
||||||
b = _mm512_xor_si512(b,c1);
|
b = _mm512_xor_si512(b,c1);
|
||||||
|
|
||||||
|
#define MULT24W( a0, a1 ) \
|
||||||
|
do { \
|
||||||
|
__m512i b = _mm512_xor_si512( a0, \
|
||||||
|
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
|
||||||
|
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
|
||||||
|
_mm512_bslli_epi128( a1,12 ) ); \
|
||||||
|
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
|
||||||
|
_mm512_bslli_epi128( b,12 ) ); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
/*
|
||||||
#define MULT24W( a0, a1, mask ) \
|
#define MULT24W( a0, a1, mask ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i b = _mm512_xor_si512( a0, \
|
__m512i b = _mm512_xor_si512( a0, \
|
||||||
@@ -73,6 +84,7 @@ do { \
|
|||||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
|
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
|
||||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
|
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
|
||||||
} while(0)
|
} while(0)
|
||||||
|
*/
|
||||||
|
|
||||||
// confirm pointer arithmetic
|
// confirm pointer arithmetic
|
||||||
// ok but use array indexes
|
// ok but use array indexes
|
||||||
@@ -85,6 +97,21 @@ do { \
|
|||||||
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
||||||
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
||||||
|
|
||||||
|
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||||
|
t = a0;\
|
||||||
|
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||||
|
a2 = _mm512_xor_si512(a2,a3);\
|
||||||
|
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||||
|
a3 = mm512_xorand( a2, a3, t ); \
|
||||||
|
a2 = mm512_xorand( a1, a2, a0);\
|
||||||
|
a1 = _mm512_or_si512(a1,a3);\
|
||||||
|
a3 = _mm512_xor_si512(a3,a2);\
|
||||||
|
t = _mm512_xor_si512(t,a1);\
|
||||||
|
a2 = _mm512_and_si512(a2,a1);\
|
||||||
|
a1 = mm512_xnor(a1,a0);\
|
||||||
|
a0 = t;
|
||||||
|
|
||||||
|
/*
|
||||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||||
t = _mm512_load_si512(&a0);\
|
t = _mm512_load_si512(&a0);\
|
||||||
a0 = _mm512_or_si512(a0,a1);\
|
a0 = _mm512_or_si512(a0,a1);\
|
||||||
@@ -103,7 +130,25 @@ do { \
|
|||||||
a2 = _mm512_and_si512(a2,a1);\
|
a2 = _mm512_and_si512(a2,a1);\
|
||||||
a1 = _mm512_xor_si512(a1,a0);\
|
a1 = _mm512_xor_si512(a1,a0);\
|
||||||
a0 = _mm512_load_si512(&t);
|
a0 = _mm512_load_si512(&t);
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define MIXWORD4W(a,b,t1,t2)\
|
||||||
|
b = _mm512_xor_si512(a,b);\
|
||||||
|
t1 = _mm512_slli_epi32(a,2);\
|
||||||
|
t2 = _mm512_srli_epi32(a,30);\
|
||||||
|
a = mm512_xoror( b, t1, t2 ); \
|
||||||
|
t1 = _mm512_slli_epi32(b,14);\
|
||||||
|
t2 = _mm512_srli_epi32(b,18);\
|
||||||
|
b = _mm512_or_si512(t1,t2);\
|
||||||
|
b = mm512_xoror( a, t1, t2 ); \
|
||||||
|
t1 = _mm512_slli_epi32(a,10);\
|
||||||
|
t2 = _mm512_srli_epi32(a,22);\
|
||||||
|
a = mm512_xoror( b, t1, t2 ); \
|
||||||
|
t1 = _mm512_slli_epi32(b,1);\
|
||||||
|
t2 = _mm512_srli_epi32(b,31);\
|
||||||
|
b = _mm512_or_si512(t1,t2);
|
||||||
|
|
||||||
|
/*
|
||||||
#define MIXWORD4W(a,b,t1,t2)\
|
#define MIXWORD4W(a,b,t1,t2)\
|
||||||
b = _mm512_xor_si512(a,b);\
|
b = _mm512_xor_si512(a,b);\
|
||||||
t1 = _mm512_slli_epi32(a,2);\
|
t1 = _mm512_slli_epi32(a,2);\
|
||||||
@@ -121,6 +166,7 @@ do { \
|
|||||||
t1 = _mm512_slli_epi32(b,1);\
|
t1 = _mm512_slli_epi32(b,1);\
|
||||||
t2 = _mm512_srli_epi32(b,31);\
|
t2 = _mm512_srli_epi32(b,31);\
|
||||||
b = _mm512_or_si512(t1,t2);
|
b = _mm512_or_si512(t1,t2);
|
||||||
|
*/
|
||||||
|
|
||||||
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||||
a1 = _mm512_shuffle_epi32(a1,147);\
|
a1 = _mm512_shuffle_epi32(a1,147);\
|
||||||
@@ -235,21 +281,13 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
|||||||
__m512i msg0, msg1;
|
__m512i msg0, msg1;
|
||||||
__m512i tmp[2];
|
__m512i tmp[2];
|
||||||
__m512i x[8];
|
__m512i x[8];
|
||||||
const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );
|
|
||||||
|
|
||||||
t0 = chainv[0];
|
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||||
t1 = chainv[1];
|
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||||
|
t0 = mm512_xor3( t0, chainv[6], chainv[8] );
|
||||||
|
t1 = mm512_xor3( t1, chainv[7], chainv[9] );
|
||||||
|
|
||||||
t0 = _mm512_xor_si512( t0, chainv[2] );
|
MULT24W( t0, t1 );
|
||||||
t1 = _mm512_xor_si512( t1, chainv[3] );
|
|
||||||
t0 = _mm512_xor_si512( t0, chainv[4] );
|
|
||||||
t1 = _mm512_xor_si512( t1, chainv[5] );
|
|
||||||
t0 = _mm512_xor_si512( t0, chainv[6] );
|
|
||||||
t1 = _mm512_xor_si512( t1, chainv[7] );
|
|
||||||
t0 = _mm512_xor_si512( t0, chainv[8] );
|
|
||||||
t1 = _mm512_xor_si512( t1, chainv[9] );
|
|
||||||
|
|
||||||
MULT24W( t0, t1, MASK );
|
|
||||||
|
|
||||||
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
|
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
|
||||||
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
|
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
|
||||||
@@ -268,68 +306,67 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
|||||||
t0 = chainv[0];
|
t0 = chainv[0];
|
||||||
t1 = chainv[1];
|
t1 = chainv[1];
|
||||||
|
|
||||||
MULT24W( chainv[0], chainv[1], MASK );
|
MULT24W( chainv[0], chainv[1] );
|
||||||
chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
|
chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
|
||||||
chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
|
chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
|
||||||
|
|
||||||
MULT24W( chainv[2], chainv[3], MASK );
|
MULT24W( chainv[2], chainv[3] );
|
||||||
chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
|
chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
|
||||||
chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
|
chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
|
||||||
|
|
||||||
MULT24W( chainv[4], chainv[5], MASK );
|
MULT24W( chainv[4], chainv[5] );
|
||||||
chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
|
chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
|
||||||
chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
|
chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
|
||||||
|
|
||||||
MULT24W( chainv[6], chainv[7], MASK );
|
MULT24W( chainv[6], chainv[7] );
|
||||||
chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
|
chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
|
||||||
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
|
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
|
||||||
|
|
||||||
MULT24W( chainv[8], chainv[9], MASK );
|
MULT24W( chainv[8], chainv[9] );
|
||||||
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
|
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
|
||||||
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
|
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
|
||||||
|
|
||||||
t0 = chainv[8];
|
t0 = chainv[8];
|
||||||
t1 = chainv[9];
|
t1 = chainv[9];
|
||||||
|
|
||||||
MULT24W( chainv[8], chainv[9], MASK );
|
MULT24W( chainv[8], chainv[9] );
|
||||||
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
|
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
|
||||||
chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
|
chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
|
||||||
|
|
||||||
MULT24W( chainv[6], chainv[7], MASK );
|
MULT24W( chainv[6], chainv[7] );
|
||||||
chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
|
chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
|
||||||
chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
|
chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
|
||||||
|
|
||||||
MULT24W( chainv[4], chainv[5], MASK );
|
MULT24W( chainv[4], chainv[5] );
|
||||||
chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
|
chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
|
||||||
chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
|
chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
|
||||||
|
|
||||||
MULT24W( chainv[2], chainv[3], MASK );
|
MULT24W( chainv[2], chainv[3] );
|
||||||
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
|
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
|
||||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
||||||
|
|
||||||
MULT24W( chainv[0], chainv[1], MASK );
|
MULT24W( chainv[0], chainv[1] );
|
||||||
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
|
chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
|
||||||
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
|
chainv[1] = mm512_xor3( chainv[1], t1, msg1 );
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1 );
|
||||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1 );
|
||||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1 );
|
||||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1);
|
||||||
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
|
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
|
||||||
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
|
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1 );
|
||||||
|
|
||||||
// replace with ror
|
|
||||||
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
||||||
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
||||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||||
@@ -389,18 +426,10 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
/*---- blank round with m=0 ----*/
|
/*---- blank round with m=0 ----*/
|
||||||
rnd512_4way( state, zero );
|
rnd512_4way( state, zero );
|
||||||
|
|
||||||
t[0] = chainv[0];
|
t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||||
t[1] = chainv[1];
|
t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||||
|
t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
|
||||||
t[0] = _mm512_xor_si512( t[0], chainv[2] );
|
t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );
|
||||||
t[1] = _mm512_xor_si512( t[1], chainv[3] );
|
|
||||||
t[0] = _mm512_xor_si512( t[0], chainv[4] );
|
|
||||||
t[1] = _mm512_xor_si512( t[1], chainv[5] );
|
|
||||||
t[0] = _mm512_xor_si512( t[0], chainv[6] );
|
|
||||||
t[1] = _mm512_xor_si512( t[1], chainv[7] );
|
|
||||||
t[0] = _mm512_xor_si512( t[0], chainv[8] );
|
|
||||||
t[1] = _mm512_xor_si512( t[1], chainv[9] );
|
|
||||||
|
|
||||||
t[0] = _mm512_shuffle_epi32( t[0], 27 );
|
t[0] = _mm512_shuffle_epi32( t[0], 27 );
|
||||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||||
|
|
||||||
@@ -496,7 +525,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
|||||||
{
|
{
|
||||||
// remaining data bytes
|
// remaining data bytes
|
||||||
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
|
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||||
buffer[1] = m512_const2_64( 0, 0x0000000080000000 );
|
buffer[1] = m512_const1_i128( 0x0000000080000000 );
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -520,7 +549,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
|
|||||||
rnd512_4way( state, buffer );
|
rnd512_4way( state, buffer );
|
||||||
else
|
else
|
||||||
{ // empty pad block, constant data
|
{ // empty pad block, constant data
|
||||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||||
msg[1] = m512_zero;
|
msg[1] = m512_zero;
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -583,13 +612,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
|||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||||
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
|
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// empty pad block
|
// empty pad block
|
||||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||||
msg[1] = m512_zero;
|
msg[1] = m512_zero;
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -631,13 +660,13 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||||
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
|
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// empty pad block
|
// empty pad block
|
||||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||||
msg[1] = m512_zero;
|
msg[1] = m512_zero;
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -666,8 +695,6 @@ do { \
|
|||||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
// confirm pointer arithmetic
|
|
||||||
// ok but use array indexes
|
|
||||||
#define STEP_PART(x,c0,c1,t)\
|
#define STEP_PART(x,c0,c1,t)\
|
||||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||||
@@ -678,23 +705,23 @@ do { \
|
|||||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||||
|
|
||||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||||
t = _mm256_load_si256(&a0);\
|
t = a0;\
|
||||||
a0 = _mm256_or_si256(a0,a1);\
|
a0 = _mm256_or_si256(a0,a1);\
|
||||||
a2 = _mm256_xor_si256(a2,a3);\
|
a2 = _mm256_xor_si256(a2,a3);\
|
||||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
a1 = mm256_not( a1 );\
|
||||||
a0 = _mm256_xor_si256(a0,a3);\
|
a0 = _mm256_xor_si256(a0,a3);\
|
||||||
a3 = _mm256_and_si256(a3,t);\
|
a3 = _mm256_and_si256(a3,t);\
|
||||||
a1 = _mm256_xor_si256(a1,a3);\
|
a1 = _mm256_xor_si256(a1,a3);\
|
||||||
a3 = _mm256_xor_si256(a3,a2);\
|
a3 = _mm256_xor_si256(a3,a2);\
|
||||||
a2 = _mm256_and_si256(a2,a0);\
|
a2 = _mm256_and_si256(a2,a0);\
|
||||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
a0 = mm256_not( a0 );\
|
||||||
a2 = _mm256_xor_si256(a2,a1);\
|
a2 = _mm256_xor_si256(a2,a1);\
|
||||||
a1 = _mm256_or_si256(a1,a3);\
|
a1 = _mm256_or_si256(a1,a3);\
|
||||||
t = _mm256_xor_si256(t,a1);\
|
t = _mm256_xor_si256(t,a1);\
|
||||||
a3 = _mm256_xor_si256(a3,a2);\
|
a3 = _mm256_xor_si256(a3,a2);\
|
||||||
a2 = _mm256_and_si256(a2,a1);\
|
a2 = _mm256_and_si256(a2,a1);\
|
||||||
a1 = _mm256_xor_si256(a1,a0);\
|
a1 = _mm256_xor_si256(a1,a0);\
|
||||||
a0 = _mm256_load_si256(&t);\
|
a0 = t;\
|
||||||
|
|
||||||
#define MIXWORD(a,b,t1,t2)\
|
#define MIXWORD(a,b,t1,t2)\
|
||||||
b = _mm256_xor_si256(a,b);\
|
b = _mm256_xor_si256(a,b);\
|
||||||
@@ -832,7 +859,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
 __m256i msg0, msg1;
 __m256i tmp[2];
 __m256i x[8];
-const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
+const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );

 t0 = chainv[0];
 t1 = chainv[1];
@@ -1088,7 +1115,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
 {
    // remaining data bytes
    buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
-   buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
+   buffer[1] = m256_const1_i128( 0x0000000080000000 );
 }
 return 0;
 }
@@ -1104,7 +1131,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
    rnd512_2way( state, buffer );
 else
 {   // empty pad block, constant data
-   msg[0] = m256_const2_64( 0, 0x0000000080000000 );
+   msg[0] = m256_const1_i128( 0x0000000080000000 );
    msg[1] = m256_zero;
    rnd512_2way( state, msg );
 }
@@ -1159,13 +1186,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
|||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
msg[1] = m256_const1_i128( 0x0000000080000000 );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// empty pad block
|
// empty pad block
|
||||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||||
msg[1] = m256_zero;
|
msg[1] = m256_zero;
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -1206,13 +1233,13 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
|||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
msg[1] = m256_const1_i128( 0x0000000080000000 );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// empty pad block
|
// empty pad block
|
||||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||||
msg[1] = m256_zero;
|
msg[1] = m256_zero;
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
 #include "simd-utils.h"
 #include "luffa_for_sse2.h"

-#define MULT2(a0,a1) do \
+#define MULT2( a0, a1 ) do \
 { \
  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
  a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
@@ -345,11 +345,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
 // 16 byte partial block exists for 80 byte len
 if ( state->rembytes )
    // padding of partial block
-   rnd512( state, m128_const_64( 0, 0x80000000 ),
+   rnd512( state, m128_const_i128( 0x80000000 ),
            mm128_bswap_32( cast_m128i( data ) ) );
 else
    // empty pad block
-   rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
+   rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );

 finalization512( state, (uint32*) output );
 if ( state->hashbitlen > 512 )
@@ -394,11 +394,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
 // 16 byte partial block exists for 80 byte len
 if ( state->rembytes )
    // padding of partial block
-   rnd512( state, m128_const_64( 0, 0x80000000 ),
+   rnd512( state, m128_const_i128( 0x80000000 ),
            mm128_bswap_32( cast_m128i( data ) ) );
 else
    // empty pad block
-   rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
+   rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );

 finalization512( state, (uint32*) output );
 if ( state->hashbitlen > 512 )
@@ -606,7 +606,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )

 casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
                          casti_m256i( hash, 0 ), shuff_bswap32 );
-// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );

 rnd512( state, zero, zero );

@@ -621,7 +620,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )

 casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
                          casti_m256i( hash, 0 ), shuff_bswap32 );
-// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -16,7 +16,7 @@
 typedef struct {
    blake256_16way_context     blake;
    keccak256_8way_context     keccak;
-   cube_4way_context          cube;
+   cube_4way_2buf_context     cube;
    skein256_8way_context      skein;
 #if defined(__VAES__)
    groestl256_4way_context    groestl;
@@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx;
 bool init_allium_16way_ctx()
 {
    keccak256_8way_init( &allium_16way_ctx.keccak );
-   cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
    skein256_8way_init( &allium_16way_ctx.skein );
-#if defined(__VAES__)
-   groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
-#else
-   init_groestl256( &allium_16way_ctx.groestl, 32 );
-#endif
    return true;
 }

@@ -115,8 +109,7 @@ void allium_16way_hash( void *state, const void *input )
 intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
 intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );

-cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
-cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
+cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );

 dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
 dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input )
 intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
 intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );

-cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
-cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
+cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );

 dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
 dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
@@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
 typedef struct {
    blake256_8way_context     blake;
    keccak256_4way_context    keccak;
-   cubehashParam             cube;
+   cube_2way_context         cube;
    skein256_4way_context     skein;
 #if defined(__VAES__)
    groestl256_2way_context   groestl;
@@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx;
 bool init_allium_8way_ctx()
 {
    keccak256_4way_init( &allium_8way_ctx.keccak );
-   cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
    skein256_4way_init( &allium_8way_ctx.skein );
-#if defined(__VAES__)
-   groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
-#else
-   init_groestl256( &allium_8way_ctx.groestl, 32 );
-#endif
    return true;
 }

@@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, const void *input )
|
|||||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||||
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
|
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
dintrlv_2x128( hash0, hash1, vhashA, 256 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
dintrlv_2x128( hash2, hash3, vhashB, 256 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
|
intrlv_2x128( vhashA, hash4, hash5, 256 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
intrlv_2x128( vhashB, hash6, hash7, 256 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
|
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
|
dintrlv_2x128( hash4, hash5, vhashA, 256 );
|
||||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
dintrlv_2x128( hash6, hash7, vhashB, 256 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
|
|
||||||
|
|
||||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||||
|
|||||||
@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
    G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_ror256_64( s1); \
+   s3 = mm512_shufll256_64( s3 ); \
+   s1 = mm512_shuflr256_64( s1); \
    s2 = mm512_swap256_128( s2 ); \
-   s3 = mm512_rol256_64( s3 ); \
    G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_rol256_64( s1 ); \
+   s3 = mm512_shuflr256_64( s3 ); \
-   s2 = mm512_swap256_128( s2 ); \
+   s1 = mm512_shufll256_64( s1 ); \
-   s3 = mm512_ror256_64( s3 );
+   s2 = mm512_swap256_128( s2 );

 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
    LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_ror_1x64( s1); \
+   s3 = mm256_shufll_64( s3 ); \
+   s1 = mm256_shuflr_64( s1); \
    s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rol_1x64( s3 ); \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rol_1x64( s1 ); \
+   s3 = mm256_shuflr_64( s3 ); \
-   s2 = mm256_swap_128( s2 ); \
+   s1 = mm256_shufll_64( s1 ); \
-   s3 = mm256_ror_1x64( s3 );
+   s2 = mm256_swap_128( s2 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
    LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
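Note on the renamed rotations: the Lyra2 rounds now move the s1/s3 rows with the shuflr/shufll helpers (shuffle the 64-bit lanes right or left by one position) instead of the old ror/rol names, so only the naming and the ordering of the statements change, not the permutation. On AVX2 a one-lane rotation of a 256-bit row is a single cross-lane shuffle; a sketch assuming only the intrinsics, with helper names that are mine rather than the project's:

    #include <immintrin.h>

    // Rotate the four 64-bit lanes of v right by one lane: {a,b,c,d} -> {b,c,d,a}.
    static inline __m256i rotr_1x64_lanes( __m256i v )
    {  return _mm256_permute4x64_epi64( v, 0x39 );  }   // selector 0b00111001

    // Rotate left by one lane: {a,b,c,d} -> {d,a,b,c}.
    static inline __m256i rotl_1x64_lanes( __m256i v )
    {  return _mm256_permute4x64_epi64( v, 0x93 );  }   // selector 0b10010011
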
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
    G_2X64( s0, s2, s4, s6 ); \
    G_2X64( s1, s3, s5, s7 ); \
-   mm128_ror256_64( s2, s3 ); \
+   mm128_vrol256_64( s6, s7 ); \
+   mm128_vror256_64( s2, s3 ); \
    mm128_swap256_128( s4, s5 ); \
-   mm128_rol256_64( s6, s7 ); \
    G_2X64( s0, s2, s4, s6 ); \
    G_2X64( s1, s3, s5, s7 ); \
-   mm128_rol256_64( s2, s3 ); \
+   mm128_vror256_64( s6, s7 ); \
-   mm128_swap256_128( s4, s5 ); \
+   mm128_vrol256_64( s2, s3 ); \
-   mm128_ror256_64( s6, s7 );
+   mm128_swap256_128( s4, s5 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
    LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -13,6 +13,7 @@
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/ripemd/sph_ripemd.h"
 #include "algo/sha/sph_sha2.h"
+#include "algo/sha/sha256-hash.h"

 #define EPSa DBL_EPSILON
 #define EPS1 DBL_EPSILON
@@ -104,7 +105,7 @@ uint32_t sw2_( int nnounce )
 }

 typedef struct {
-   sph_sha256_context       sha256;
+   sha256_context           sha256;
    sph_sha512_context       sha512;
    sph_keccak512_context    keccak;
    sph_whirlpool_context    whirlpool;
@@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx;

 void init_m7m_ctx()
 {
-   sph_sha256_init( &m7m_ctx );
+   sha256_ctx_init( &m7m_ctx.sha256 );
    sph_sha512_init( &m7m_ctx.sha512 );
    sph_keccak512_init( &m7m_ctx.keccak );
    sph_whirlpool_init( &m7m_ctx.whirlpool );
@@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,

    m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
    memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
-   sph_sha256_context ctxf_sha256;

    memcpy(data, pdata, 80);

-   sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
+   sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
    sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
    sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
    sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
@@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,

    memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );

-   sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
+   sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
-   sph_sha256_close( &ctx2.sha256, bhash[0] );
+   sha256_final( &ctx2.sha256, bhash[0] );

    sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
    sph_sha512_close( &ctx2.sha512, bhash[1] );
@@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
    bytes = mpz_sizeinbase(product, 256);
    mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);

-   sph_sha256_init( &ctxf_sha256 );
-   sph_sha256( &ctxf_sha256, bdata, bytes );
-   sph_sha256_close( &ctxf_sha256, hash );
+   sha256_full( hash, bdata, bytes );

    digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
    mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
@@ -260,9 +258,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
    mpzscale=bytes;
    mpz_export(bdata, NULL, -1, 1, 0, 0, product);

-   sph_sha256_init( &ctxf_sha256 );
-   sph_sha256( &ctxf_sha256, bdata, bytes );
-   sph_sha256_close( &ctxf_sha256, hash );
+   sha256_full( hash, bdata, bytes );
    }

    if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
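Note on sha256_full: the m7m rewrite replaces each three-call sph_sha256 init/update/close sequence with the project's one-shot helper. Based only on how it is used in this diff, sha256_full(hash, data, len) behaves like init plus update plus final over a single buffer; a hedged sketch of such a wrapper in terms of the incremental API that also appears here (sha256_ctx_init / sha256_update / sha256_final), with the real implementation possibly differing (for example a SHA-NI path):

    #include <stddef.h>
    #include "algo/sha/sha256-hash.h"   // path as used elsewhere in this tree

    // One-shot convenience equivalent to the init/update/final sequence it
    // replaces in scanhash_m7m_hash; the wrapper name here is illustrative.
    static void sha256_oneshot( void *hash, const void *data, size_t len )
    {
        sha256_context ctx;
        sha256_ctx_init( &ctx );
        sha256_update( &ctx, data, len );
        sha256_final( &ctx, hash );
    }
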
@@ -312,10 +312,26 @@ do { \
|
|||||||
BUPDATE1_8W( 7, 1 ); \
|
BUPDATE1_8W( 7, 1 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||||
|
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
|
||||||
|
|
||||||
|
#define THETA_8W(n0, n1, n2, n4) \
|
||||||
|
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||||
(g ## n0 = _mm256_xor_si256( a ## n0, \
|
(g ## n0 = _mm256_xor_si256( a ## n0, \
|
||||||
_mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
|
_mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
|
||||||
|
|
||||||
|
#define THETA_8W(n0, n1, n2, n4) \
|
||||||
|
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
||||||
|
a ## n4 ) ) )
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#define PI_ALL_8W do { \
|
#define PI_ALL_8W do { \
|
||||||
a0 = g0; \
|
a0 = g0; \
|
||||||
a1 = mm256_rol_32( g7, 1 ); \
|
a1 = mm256_rol_32( g7, 1 ); \
|
||||||
@@ -336,9 +352,6 @@ do { \
|
|||||||
a16 = mm256_rol_32( g10, 8 ); \
|
a16 = mm256_rol_32( g10, 8 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define THETA_8W(n0, n1, n2, n4) \
|
|
||||||
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
|
||||||
a ## n4 ) ) )
|
|
||||||
|
|
||||||
#define SIGMA_ALL_8W do { \
|
#define SIGMA_ALL_8W do { \
|
||||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||||
|
|||||||
@@ -127,9 +127,7 @@ void quark_8way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||||
|
|
||||||
if ( ( vh_mask & 0x0f ) != 0x0f )
|
|
||||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||||
if ( ( vh_mask & 0xf0 ) != 0xf0 )
|
|
||||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||||
|
|
||||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||||
@@ -139,21 +137,13 @@ void quark_8way_hash( void *state, const void *input )
|
|||||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
vhash, 512 );
|
vhash, 512 );
|
||||||
|
|
||||||
if ( hash0[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
if ( hash1[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
if ( hash2[0] & 8)
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
if ( hash3[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
if ( hash4[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
if ( hash5[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
if ( hash6[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
if ( hash7[0] & 8 )
|
|
||||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
|||||||
@@ -7,24 +7,19 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "sph_ripemd.h"
|
#include "sph_ripemd.h"
|
||||||
#include "algo/sha/sph_sha2.h"
|
#include "algo/sha/sha256-hash.h"
|
||||||
|
|
||||||
void lbry_hash(void* output, const void* input)
|
void lbry_hash(void* output, const void* input)
|
||||||
{
|
{
|
||||||
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||||
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
||||||
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
||||||
uint32_t _ALIGN(64) hashA[16];
|
uint32_t _ALIGN(64) hashA[16];
|
||||||
uint32_t _ALIGN(64) hashB[16];
|
uint32_t _ALIGN(64) hashB[16];
|
||||||
uint32_t _ALIGN(64) hashC[16];
|
uint32_t _ALIGN(64) hashC[16];
|
||||||
|
|
||||||
sph_sha256_init( &ctx_sha256 );
|
sha256_full( hashA, input, 112 );
|
||||||
sph_sha256( &ctx_sha256, input, 112 );
|
sha256_full( hashA, hashA, 32 );
|
||||||
sph_sha256_close( &ctx_sha256, hashA );
|
|
||||||
|
|
||||||
sph_sha256_init( &ctx_sha256 );
|
|
||||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
|
||||||
sph_sha256_close( &ctx_sha256, hashA );
|
|
||||||
|
|
||||||
sph_sha512_init( &ctx_sha512 );
|
sph_sha512_init( &ctx_sha512 );
|
||||||
sph_sha512( &ctx_sha512, hashA, 32 );
|
sph_sha512( &ctx_sha512, hashA, 32 );
|
||||||
@@ -38,14 +33,12 @@ void lbry_hash(void* output, const void* input)
|
|||||||
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
||||||
sph_ripemd160_close( &ctx_ripemd, hashC );
|
sph_ripemd160_close( &ctx_ripemd, hashC );
|
||||||
|
|
||||||
sph_sha256_init( &ctx_sha256 );
|
sha256_ctx_init( &ctx_sha256 );
|
||||||
sph_sha256( &ctx_sha256, hashB, 20 );
|
sha256_update( &ctx_sha256, hashB, 20 );
|
||||||
sph_sha256( &ctx_sha256, hashC, 20 );
|
sha256_update( &ctx_sha256, hashC, 20 );
|
||||||
sph_sha256_close( &ctx_sha256, hashA );
|
sha256_final( &ctx_sha256, hashA );
|
||||||
|
|
||||||
sph_sha256_init( &ctx_sha256 );
|
sha256_full( hashA, hashA, 32 );
|
||||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
|
||||||
sph_sha256_close( &ctx_sha256, hashA );
|
|
||||||
|
|
||||||
memcpy( output, hashA, 32 );
|
memcpy( output, hashA, 32 );
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,8 +69,12 @@ typedef unsigned int uint;
 #define SCRYPT_HASH_BLOCK_SIZE 64U
 #define SCRYPT_HASH_DIGEST_SIZE 32U

-#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
-#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
+//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+
+#define ROTL32(a,b) rol32(a,b)
+#define ROTR32(a,b) ror32(a,b)
+

 #define U8TO32_BE(p) \
    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
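Note on the rotate macros: the shift-and-or forms are commented out in favour of rol32/ror32. The shift form is undefined when b is 0 (it shifts by 32), and a dedicated helper also lets the compiler emit a single rotate instruction. A sketch of portable helpers with the semantics the new macros appear to rely on; the exact rol32/ror32 the project uses may be defined elsewhere and differ:

    #include <stdint.h>

    // Rotates that stay defined for n in [0,31] and compile down to ROL/ROR.
    static inline uint32_t rol32_sketch( uint32_t x, unsigned n )
    {  return ( x << ( n & 31 ) ) | ( x >> ( (-n) & 31 ) );  }

    static inline uint32_t ror32_sketch( uint32_t x, unsigned n )
    {  return ( x >> ( n & 31 ) ) | ( x << ( (-n) & 31 ) );  }
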
algo/scrypt/scrypt-core-4way.c (new file, 3265 lines; diff suppressed because it is too large)

algo/scrypt/scrypt-core-4way.h (new file, 70 lines)
@@ -0,0 +1,70 @@
#ifndef SCRYPT_CORE_4WAY_H__
#define SCRYPT_CORE_4WAY_H__

#include "simd-utils.h"
#include <stdlib.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

// Serial SIMD over 4 way parallel
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );

// 4 way parallel over serial SIMD
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );

#endif

#if defined(__AVX2__)

void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );

// 2 way parallel over SIMD128
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );

// Double buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );

// Triple buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );

// Serial SIMD128 over 2 way parallel
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );

// Double buffered simd over parallel
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );

// Triple buffered 2 way
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );

// Quadruple buffered
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );

#endif

#if defined(__SSE2__)

// Parallel 4 way, 4x memory
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );

// Linear SIMD 1 way, 1x memory, lowest
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );

// Double buffered, 2x memory
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );

// Triple buffered
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );

// Quadruple buffered, 4x memory
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );

#endif

// For reference only
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );

#endif
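All of these entry points share the same memory model: X holds one or more 128-byte scrypt working blocks and V is a scratchpad of N such blocks per lane; the suffixes describe how lanes and buffers are grouped. A hedged sketch of calling the plain linear variant (buffer sizes follow from scrypt's definition, not from this header):

   #include <stdint.h>
   #include <stdlib.h>
   #include "algo/scrypt/scrypt-core-4way.h"   // assumed include path

   // Hypothetical caller: one lane, 32 words (128 bytes) of state,
   // N * 128 bytes of scratchpad (128 KiB for scrypt N = 1024).
   static int example_scrypt_core( uint32_t X[32], uint32_t N )
   {
      uint32_t *V = (uint32_t*) malloc( (size_t)N * 32 * sizeof(uint32_t) );
      if ( !V ) return -1;
      scrypt_core_simd128( X, V, N );
      free( V );
      return 0;
   }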
 206  algo/scrypt/scrypt-core-ref.c  (new file)
@@ -0,0 +1,206 @@
#include "scrypt-core-ref.h"

#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
{
   uint32_t x0 = (B[ 0] ^= C[ 0]),
            x1 = (B[ 1] ^= C[ 1]),
            x2 = (B[ 2] ^= C[ 2]),
            x3 = (B[ 3] ^= C[ 3]);
   uint32_t x4 = (B[ 4] ^= C[ 4]),
            x5 = (B[ 5] ^= C[ 5]),
            x6 = (B[ 6] ^= C[ 6]),
            x7 = (B[ 7] ^= C[ 7]);
   uint32_t x8 = (B[ 8] ^= C[ 8]),
            x9 = (B[ 9] ^= C[ 9]),
            xa = (B[10] ^= C[10]),
            xb = (B[11] ^= C[11]);
   uint32_t xc = (B[12] ^= C[12]),
            xd = (B[13] ^= C[13]),
            xe = (B[14] ^= C[14]),
            xf = (B[15] ^= C[15]);

   // Salsa20/8: the file unrolls the following double round (columns, then rows)
   // four times in sequence; it is shown here as a loop with identical behaviour.
   for ( int r = 0; r < 4; r++ )
   {
      /* Operate on columns. */
      x4 ^= ROTL(x0 + xc,  7);   x9 ^= ROTL(x5 + x1,  7);
      xe ^= ROTL(xa + x6,  7);   x3 ^= ROTL(xf + xb,  7);
      x8 ^= ROTL(x4 + x0,  9);   xd ^= ROTL(x9 + x5,  9);
      x2 ^= ROTL(xe + xa,  9);   x7 ^= ROTL(x3 + xf,  9);
      xc ^= ROTL(x8 + x4, 13);   x1 ^= ROTL(xd + x9, 13);
      x6 ^= ROTL(x2 + xe, 13);   xb ^= ROTL(x7 + x3, 13);
      x0 ^= ROTL(xc + x8, 18);   x5 ^= ROTL(x1 + xd, 18);
      xa ^= ROTL(x6 + x2, 18);   xf ^= ROTL(xb + x7, 18);

      /* Operate on rows. */
      x1 ^= ROTL(x0 + x3,  7);   x6 ^= ROTL(x5 + x4,  7);
      xb ^= ROTL(xa + x9,  7);   xc ^= ROTL(xf + xe,  7);
      x2 ^= ROTL(x1 + x0,  9);   x7 ^= ROTL(x6 + x5,  9);
      x8 ^= ROTL(xb + xa,  9);   xd ^= ROTL(xc + xf,  9);
      x3 ^= ROTL(x2 + x1, 13);   x4 ^= ROTL(x7 + x6, 13);
      x9 ^= ROTL(x8 + xb, 13);   xe ^= ROTL(xd + xc, 13);
      x0 ^= ROTL(x3 + x2, 18);   x5 ^= ROTL(x4 + x7, 18);
      xa ^= ROTL(x9 + x8, 18);   xf ^= ROTL(xe + xd, 18);
   }

   B[ 0] += x0;   B[ 1] += x1;   B[ 2] += x2;   B[ 3] += x3;
   B[ 4] += x4;   B[ 5] += x5;   B[ 6] += x6;   B[ 7] += x7;
   B[ 8] += x8;   B[ 9] += x9;   B[10] += xa;   B[11] += xb;
   B[12] += xc;   B[13] += xd;   B[14] += xe;   B[15] += xf;
}

/**
 * @param X input/output
 * @param V scratch buffer
 * @param N factor (def. 1024)
 */
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
{
   for (uint32_t i = 0; i < N; i++) {
      memcpy(&V[i * 32], X, 128);
      xor_salsa8(&X[0], &X[16]);
      xor_salsa8(&X[16], &X[0]);
   }
   for (uint32_t i = 0; i < N; i++) {
      uint32_t j = 32 * (X[16] & (N - 1));
      for (uint8_t k = 0; k < 32; k++)
         X[k] ^= V[j + k];
      xor_salsa8(&X[0], &X[16]);
      xor_salsa8(&X[16], &X[0]);
   }
}
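scrypt_core_ref above is the textbook SMix loop: N sequential writes of the 128-byte block into V, followed by N data-dependent reads back, indexed by X[16] mod N. A small hedged usage sketch (N = 1024 is the value the comment calls the default):

   #include <stdint.h>

   void scrypt_core_ref( uint32_t *X, uint32_t *V, uint32_t N );   // as defined above

   // Hypothetical wrapper: run the reference core on one block with N = 1024.
   static void example_smix_1024( uint32_t X[32] )
   {
      static uint32_t V[1024 * 32];   // 128 KiB scratchpad
      scrypt_core_ref( X, V, 1024 );
   }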
1593  algo/scrypt/scrypt.c  (diff suppressed because it is too large)
@@ -39,17 +39,10 @@
 void
 SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
 {
-#if defined(HMAC_SPH_SHA)
-   sph_sha256_context ctx;
-   sph_sha256_init( &ctx );
-   sph_sha256( &ctx, in, len );
-   sph_sha256_close( &ctx, digest );
-#else
-   SHA256_CTX ctx;
-   SHA256_Init( &ctx );
-   SHA256_Update( &ctx, in, len );
-   SHA256_Final( digest, &ctx );
-#endif
+   sha256_context ctx;
+   sha256_ctx_init( &ctx );
+   sha256_update( &ctx, in, len );
+   sha256_final( &ctx, digest );
 }

 /**
@@ -71,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
 void
 HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
 {
-   unsigned char pad[64];
+   unsigned char pad[64] __attribute__ ((aligned (64)));
    unsigned char khash[32];
    const unsigned char * K = _K;
    size_t i;
@@ -79,51 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
    /* If Klen > 64, the key is really SHA256(K). */
    if ( Klen > 64 )
    {
-#if defined(HMAC_SPH_SHA)
-      sph_sha256_init( &ctx->ictx );
-      sph_sha256( &ctx->ictx, K, Klen );
-      sph_sha256_close( &ctx->ictx, khash );
-#else
-      SHA256_Init( &ctx->ictx );
-      SHA256_Update( &ctx->ictx, K, Klen );
-      SHA256_Final( khash, &ctx->ictx );
-#endif
+      sha256_ctx_init( &ctx->ictx );
+      sha256_update( &ctx->ictx, K, Klen );
+      sha256_final( &ctx->ictx, khash );
       K = khash;
       Klen = 32;
    }

    /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-#if defined(HMAC_SPH_SHA)
-   sph_sha256_init( &ctx->ictx );
-#else
-   SHA256_Init( &ctx->ictx );
-#endif
+   sha256_ctx_init( &ctx->ictx );

    for ( i = 0; i < Klen; i++ )  pad[i] = K[i] ^ 0x36;

    memset( pad + Klen, 0x36, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
-   sph_sha256( &ctx->ictx, pad, 64 );
-#else
-   SHA256_Update( &ctx->ictx, pad, 64 );
-#endif
+   sha256_update( &ctx->ictx, pad, 64 );

    /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-#if defined(HMAC_SPH_SHA)
-   sph_sha256_init( &ctx->octx );
-#else
-   SHA256_Init( &ctx->octx );
-#endif
+   sha256_ctx_init( &ctx->octx );

    for ( i = 0; i < Klen; i++ )  pad[i] = K[i] ^ 0x5c;

    memset( pad + Klen, 0x5c, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
-   sph_sha256( &ctx->octx, pad, 64 );
-#else
-   SHA256_Update( &ctx->octx, pad, 64 );
-#endif
+   sha256_update( &ctx->octx, pad, 64 );
 }

 /* Add bytes to the HMAC-SHA256 operation. */
@@ -131,33 +101,17 @@ void
 HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
 {
    /* Feed data to the inner SHA256 operation. */
-#if defined(HMAC_SPH_SHA)
-   sph_sha256( &ctx->ictx, in, len );
-#else
-   SHA256_Update( &ctx->ictx, in, len );
-#endif
+   sha256_update( &ctx->ictx, in, len );
 }

 /* Finish an HMAC-SHA256 operation. */
 void
-HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
+HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
 {
-   unsigned char ihash[32];
-#if defined(HMAC_SPH_SHA)
-   sph_sha256_close( &ctx->ictx, ihash );
-   sph_sha256( &ctx->octx, ihash, 32 );
-   sph_sha256_close( &ctx->octx, digest );
-#else
-   /* Finish the inner SHA256 operation. */
-   SHA256_Final( ihash, &ctx->ictx );
-
-   /* Feed the inner hash to the outer SHA256 operation. */
-   SHA256_Update( &ctx->octx, ihash, 32 );
-
-   /* Finish the outer SHA256 operation. */
-   SHA256_Final( digest, &ctx->octx );
-#endif
+   uint32_t ihash[8] __attribute__ ((aligned (32)));
+   sha256_final( &ctx->ictx, ihash );
+   sha256_update( &ctx->octx, ihash, 32 );
+   sha256_final( &ctx->octx, digest );
 }

 /**
@@ -170,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
                size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
 {
    HMAC_SHA256_CTX PShctx, hctx;
-   uint8_t _ALIGN(128) T[32];
-   uint8_t _ALIGN(128) U[32];
+   uint64_t _ALIGN(128) T[4];
+   uint64_t _ALIGN(128) U[4];
+// uint8_t _ALIGN(128) T[32];
+// uint8_t _ALIGN(128) U[32];
    uint32_t ivec;
    size_t i, clen;
    uint64_t j;
@@ -207,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
 //       _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
 //       _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );

-//       for ( k = 0; k < 4; k++ )  T[k] ^= U[k];
+         for ( k = 0; k < 4; k++ )  T[k] ^= U[k];

-         for ( k = 0; k < 32; k++ )
-            T[k] ^= U[k];
+//       for ( k = 0; k < 32; k++ )
+//          T[k] ^= U[k];
      }

      /* Copy as many bytes as necessary into buf. */
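The last hunk switches the per-iteration PBKDF2 XOR from 32 byte operations to four 64-bit operations, which is valid because T and U are now aligned uint64_t[4] arrays covering the same 32 bytes. A stand-alone illustration of the two equivalent forms:

   #include <stdint.h>

   // 32 byte-wise XORs ...
   static void xor32_bytes( uint8_t T[32], const uint8_t U[32] )
   {
      for ( int k = 0; k < 32; k++ )  T[k] ^= U[k];
   }

   // ... and the same work done as four 64-bit lanes over the same 32 bytes.
   static void xor32_words( uint64_t T[4], const uint64_t U[4] )
   {
      for ( int k = 0; k < 4; k++ )   T[k] ^= U[k];
   }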
@@ -29,30 +29,20 @@
 #ifndef HMAC_SHA256_H__
 #define HMAC_SHA256_H__

-//#define HMAC_SSL_SHA 1
-#define HMAC_SPH_SHA 1
-
 #include <sys/types.h>
 #include <stdint.h>
-#include "sph_sha2.h"
-#include <openssl/sha.h>
+#include "sha256-hash.h"

 typedef struct HMAC_SHA256Context
 {
-#if defined(HMAC_SPH_SHA)
-   sph_sha256_context ictx;
-   sph_sha256_context octx;
-#else
-   SHA256_CTX ictx;
-   SHA256_CTX octx;
-#endif
+   sha256_context ictx;
+   sha256_context octx;
 } HMAC_SHA256_CTX;

 void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
 void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
 void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
-void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
+void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
 void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
                       size_t len, uint8_t digest[32] );
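With the HMAC_SPH_SHA/OpenSSL switch removed, the context always wraps two sha256_context states (inner and outer). A short hedged example of the streaming interface declared above (the header file name is an assumption):

   #include <stdint.h>
   #include <stddef.h>
   #include "hmac-sha256-hash.h"   // assumed name of the header shown above

   static void example_hmac( uint8_t mac[32], const void *key, size_t klen,
                             const void *msg, size_t mlen )
   {
      HMAC_SHA256_CTX ctx;
      HMAC_SHA256_Init( &ctx, key, klen );
      HMAC_SHA256_Update( &ctx, msg, mlen );
      HMAC_SHA256_Final( mac, &ctx );   // digest argument is now a void*
   }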
@@ -51,7 +51,6 @@ typedef struct {
    __m128i buf[64>>2];
    __m128i val[8];
    uint32_t count_high, count_low;
-   bool initialized;
 } sha256_4way_context __attribute__ ((aligned (64)));

 void sha256_4way_init( sha256_4way_context *sc );
@@ -59,6 +58,10 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
                          size_t len );
 void sha256_4way_close( sha256_4way_context *sc, void *dst );
 void sha256_4way_full( void *dst, const void *data, size_t len );
+void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
+                               const __m128i *state_in );
+void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
+                               const __m128i *state_in );

 #endif // SSE2
@@ -70,13 +73,21 @@ typedef struct {
    __m256i buf[64>>2];
    __m256i val[8];
    uint32_t count_high, count_low;
-   bool initialized;
 } sha256_8way_context __attribute__ ((aligned (128)));

 void sha256_8way_init( sha256_8way_context *sc );
 void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
 void sha256_8way_close( sha256_8way_context *sc, void *dst );
 void sha256_8way_full( void *dst, const void *data, size_t len );
+void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
+                               const __m256i *state_in );
+void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
+                               const __m256i *state_in );
+
+void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
+                                  const __m256i *state_in );
+void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
+                               const __m256i *state_in, const __m256i *state_mid );

 #endif // AVX2
@@ -88,13 +99,20 @@ typedef struct {
    __m512i buf[64>>2];
    __m512i val[8];
    uint32_t count_high, count_low;
-   bool initialized;
 } sha256_16way_context __attribute__ ((aligned (128)));

 void sha256_16way_init( sha256_16way_context *sc );
 void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
 void sha256_16way_close( sha256_16way_context *sc, void *dst );
 void sha256_16way_full( void *dst, const void *data, size_t len );
+void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
+                                const __m512i *state_in );
+void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
+                                const __m512i *state_in );
+void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
+                                   const __m512i *state_in );
+void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
+                                const __m512i *state_in, const __m512i *state_mid );

 #endif // AVX512
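The new *_transform_le/_be functions consume one 16-word message block per lane with the words interleaved across the vector registers (data[i] holds word i of every lane). A hedged sketch of preparing such input for the 8-way AVX2 variant; the exact lane ordering is an assumption, not stated in this header:

   #include <immintrin.h>
   #include <stdint.h>

   // Interleave eight independent 16-word message blocks: vector i carries
   // word i of lanes 0..7 (lane 0 assumed to be the lowest 32-bit element).
   static void interleave_8way( __m256i W[16], const uint32_t *msg[8] )
   {
      for ( int i = 0; i < 16; i++ )
         W[i] = _mm256_set_epi32( msg[7][i], msg[6][i], msg[5][i], msg[4][i],
                                  msg[3][i], msg[2][i], msg[1][i], msg[0][i] );
   }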
@@ -8,7 +8,7 @@
  * any later version.  See COPYING for more details.
  */

-#include "algo-gate-api.h"
+#include "sha256d-4way.h"

 #include <string.h>
 #include <inttypes.h>
@@ -180,6 +180,9 @@ static const uint32_t sha256d_hash1[16] = {
    0x00000000, 0x00000000, 0x00000000, 0x00000100
 };

+// this performs the entire hash all over again, why?
+// because main function only does 56 rounds.
+
 static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
 {
    uint32_t S[16];
@@ -195,8 +198,29 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
       hash[i] = swab32(hash[i]);
 }

-extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
+/*
+#if defined (__SHA__)
+
+#include "algo/sha/sph_sha2.h"
+
+void sha256d(unsigned char *hash, const unsigned char *data, int len)
 {
+   sph_sha256_context ctx __attribute__ ((aligned (64)));
+
+   sph_sha256_init( &ctx );
+   sph_sha256( &ctx, data, len );
+   sph_sha256_close( &ctx, hash );
+
+   sph_sha256_init( &ctx );
+   sph_sha256( &ctx, hash, 32 );
+   sph_sha256_close( &ctx, hash );
+}
+
+#else
+
+void sha256d(unsigned char *hash, const unsigned char *data, int len)
+{
+
    uint32_t S[16], T[16];
    int i, r;
@@ -220,6 +244,9 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
       be32enc((uint32_t *)hash + i, T[i]);
 }

+#endif
+*/
+
 static inline void sha256d_preextend(uint32_t *W)
 {
    W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
@@ -467,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
 void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
        const uint32_t *midstate, const uint32_t *prehash);

-static inline int scanhash_sha256d_4way( struct work *work,
+static inline int scanhash_sha256d_4way_pooler( struct work *work,
      uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
@@ -528,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
 void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
        const uint32_t *midstate, const uint32_t *prehash);

-static inline int scanhash_sha256d_8way( struct work *work,
+static inline int scanhash_sha256d_8way_pooler( struct work *work,
      uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
@@ -584,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,

 #endif /* HAVE_SHA256_8WAY */

-int scanhash_sha256d( struct work *work,
+int scanhash_sha256d_pooler( struct work *work,
     uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
@@ -600,11 +627,11 @@ int scanhash_sha256d( struct work *work,

 #ifdef HAVE_SHA256_8WAY
    if (sha256_use_8way())
-      return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
+      return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
 #endif
 #ifdef HAVE_SHA256_4WAY
    if (sha256_use_4way())
-      return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
+      return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
 #endif

    memcpy(data, pdata + 16, 64);
@@ -631,6 +658,7 @@ int scanhash_sha256d( struct work *work,
    return 0;
 }

+/*
 int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -660,13 +688,17 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
    pdata[19] = n;
    return 0;
 }
+*/

 bool register_sha256d_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AVX2_OPT;
-    gate->scanhash = (void*)&scanhash_sha256d;
-    gate->hash     = (void*)&sha256d;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+#if defined(SHA256D_16WAY)
+    gate->scanhash = (void*)&scanhash_sha256d_16way;
+#else
+    gate->scanhash = (void*)&scanhash_sha256d_pooler;
+#endif
+//  gate->hash     = (void*)&sha256d;
     return true;
 };
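The gate registration above now selects a vectorized scanhash and leaves gate->hash unset, with the scalar sha256d entry commented out. For reference, double SHA-256 itself is just two chained hashes; a hedged sketch using the sha256_full helper seen earlier in this comparison:

   #include <stdint.h>
   #include <stddef.h>

   void sha256_full( void *hash, const void *data, size_t len );   // assumed prototype from sha256-hash.h

   // Hypothetical scalar fallback: hash the data, then hash the 32-byte digest again.
   static void sha256d_simple( uint8_t hash[32], const void *data, size_t len )
   {
      sha256_full( hash, data, len );
      sha256_full( hash, hash, 32 );
   }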
 689  algo/sha/sha256-hash-2way-ni.c  (new file)
@@ -0,0 +1,689 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */

// A stripped down version with byte swapping removed.

#if defined(__SHA__)

#include "sha256-hash.h"

void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||||
|
const void *msg_X, const void *msg_Y,
|
||||||
|
const uint32_t *in_X, const uint32_t *in_Y )
|
||||||
|
{
|
||||||
|
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||||
|
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
|
||||||
|
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||||
|
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||||
|
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||||
|
|
||||||
|
// Load initial values
|
||||||
|
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||||
|
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||||
|
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||||
|
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||||
|
|
||||||
|
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||||
|
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||||
|
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||||
|
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||||
|
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||||
|
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||||
|
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||||
|
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||||
|
|
||||||
|
// Save current hash
|
||||||
|
ABEF_SAVE_X = STATE0_X;
|
||||||
|
ABEF_SAVE_Y = STATE0_Y;
|
||||||
|
CDGH_SAVE_X = STATE1_X;
|
||||||
|
CDGH_SAVE_Y = STATE1_Y;
|
||||||
|
|
||||||
|
// Rounds 0-3
|
||||||
|
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||||
|
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||||
|
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||||
|
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 4-7
|
||||||
|
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||||
|
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||||
|
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 8-11
|
||||||
|
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||||
|
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||||
|
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 12-15
|
||||||
|
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||||
|
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||||
|
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 16-19
|
||||||
|
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 20-23
|
||||||
|
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 24-27
|
||||||
|
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 28-31
|
||||||
|
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 32-35
|
||||||
|
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 36-39
|
||||||
|
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 40-43
|
||||||
|
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 44-47
|
||||||
|
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 48-51
|
||||||
|
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 52-55
|
||||||
|
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 56-59
|
||||||
|
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 60-63
|
||||||
|
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Add values back to state
|
||||||
|
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||||
|
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||||
|
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||||
|
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||||
|
|
||||||
|
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||||
|
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||||
|
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||||
|
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||||
|
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||||
|
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||||
|
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||||
|
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||||
|
|
||||||
|
// Save state
|
||||||
|
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||||
|
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||||
|
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||||
|
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||||
|
const void *msg_X, const void *msg_Y,
|
||||||
|
const uint32_t *in_X, const uint32_t *in_Y )
|
||||||
|
{
|
||||||
|
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||||
|
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
|
||||||
|
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||||
|
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||||
|
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||||
|
|
||||||
|
// Load initial values
|
||||||
|
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||||
|
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||||
|
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||||
|
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||||
|
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||||
|
|
||||||
|
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||||
|
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||||
|
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||||
|
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||||
|
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||||
|
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||||
|
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||||
|
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||||
|
|
||||||
|
// Save current hash
|
||||||
|
ABEF_SAVE_X = STATE0_X;
|
||||||
|
ABEF_SAVE_Y = STATE0_Y;
|
||||||
|
CDGH_SAVE_X = STATE1_X;
|
||||||
|
CDGH_SAVE_Y = STATE1_Y;
|
||||||
|
|
||||||
|
// Rounds 0-3
|
||||||
|
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||||
|
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||||
|
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||||
|
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
|
||||||
|
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
|
||||||
|
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 4-7
|
||||||
|
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||||
|
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||||
|
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||||
|
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
|
||||||
|
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 8-11
|
||||||
|
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||||
|
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||||
|
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||||
|
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
|
||||||
|
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 12-15
|
||||||
|
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||||
|
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||||
|
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||||
|
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
|
||||||
|
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 16-19
|
||||||
|
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 20-23
|
||||||
|
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 24-27
|
||||||
|
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 28-31
|
||||||
|
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 32-35
|
||||||
|
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 36-39
|
||||||
|
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||||
|
|
||||||
|
// Rounds 40-43
|
||||||
|
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||||
|
|
||||||
|
// Rounds 44-47
|
||||||
|
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||||
|
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||||
|
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||||
|
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||||
|
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||||
|
|
||||||
|
// Rounds 48-51
|
||||||
|
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||||
|
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||||
|
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||||
|
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||||
|
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||||
|
|
||||||
|
// Rounds 52-55
|
||||||
|
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||||
|
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||||
|
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||||
|
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 56-59
|
||||||
|
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||||
|
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||||
|
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||||
|
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||||
|
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Rounds 60-63
|
||||||
|
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||||
|
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||||
|
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||||
|
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||||
|
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||||
|
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||||
|
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||||
|
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||||
|
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||||
|
|
||||||
|
// Add values back to state
|
||||||
|
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||||
|
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||||
|
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||||
|
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||||
|
|
||||||
|
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||||
|
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||||
|
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||||
|
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||||
|
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||||
|
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||||
|
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||||
|
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||||
|
|
||||||
|
// Save state
|
||||||
|
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||||
|
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||||
|
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||||
|
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -3,13 +3,203 @@
|
|||||||
/* Based on code from Intel, and by Sean Gulley for */
|
/* Based on code from Intel, and by Sean Gulley for */
|
||||||
/* the miTLS project. */
|
/* the miTLS project. */
|
||||||
|
|
||||||
// A drop in replacement for the function of the same name in sph_sha2.c.
|
// A stripped down version with byte swapping removed.
|
||||||
|
|
||||||
#if defined(__SHA__)
|
#if defined(__SHA__)
|
||||||
|
|
||||||
#include "simd-utils.h"
|
#include "sha256-hash.h"
|
||||||
|
|
||||||
static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||||
|
const uint32_t *state_in )
|
||||||
|
{
|
||||||
|
__m128i STATE0, STATE1;
|
||||||
|
__m128i MSG, TMP;
|
||||||
|
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||||
|
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||||
|
|
||||||
|
// Load initial values
|
||||||
|
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||||
|
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||||
|
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||||
|
|
||||||
|
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||||
|
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||||
|
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||||
|
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||||
|
|
||||||
|
// Save current hash
|
||||||
|
ABEF_SAVE = STATE0;
|
||||||
|
CDGH_SAVE = STATE1;
|
||||||
|
|
||||||
|
// Rounds 0-3
|
||||||
|
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||||
|
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||||
|
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
|
||||||
|
// Rounds 4-7
|
||||||
|
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||||
|
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||||
|
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||||
|
|
||||||
|
// Rounds 8-11
|
||||||
|
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||||
|
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||||
|
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||||
|
|
||||||
|
// Rounds 12-15
|
||||||
|
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||||
|
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||||
|
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||||
|
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||||
|
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||||
|
|
||||||
|
// Rounds 16-19
|
||||||
|
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||||
|
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||||
|
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||||
|
|
||||||
|
// Rounds 20-23
|
||||||
|
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||||
|
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||||
|
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||||
|
|
||||||
|
// Rounds 24-27
|
||||||
|
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||||
|
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||||
|
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||||
|
|
||||||
|
// Rounds 28-31
|
||||||
|
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||||
|
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||||
|
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||||
|
|
||||||
|
// Rounds 32-35
|
||||||
|
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||||
|
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||||
|
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||||
|
|
||||||
|
// Rounds 36-39
|
||||||
|
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||||
|
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||||
|
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||||
|
|
||||||
|
// Rounds 40-43
|
||||||
|
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||||
|
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||||
|
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||||
|
|
||||||
|
// Rounds 44-47
|
||||||
|
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||||
|
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||||
|
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||||
|
|
||||||
|
// Rounds 48-51
|
||||||
|
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||||
|
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||||
|
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||||
|
|
||||||
|
// Rounds 52-55
|
||||||
|
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||||
|
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||||
|
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
|
||||||
|
// Rounds 56-59
|
||||||
|
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||||
|
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||||
|
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
|
||||||
|
// Rounds 60-63
|
||||||
|
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||||
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
|
|
||||||
|
// Add values back to state
|
||||||
|
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||||
|
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||||
|
|
||||||
|
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||||
|
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||||
|
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||||
|
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||||
|
|
||||||
|
// Save state
|
||||||
|
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||||
|
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||||
|
const uint32_t *state_in )
|
||||||
{
|
{
|
||||||
__m128i STATE0, STATE1;
|
__m128i STATE0, STATE1;
|
||||||
__m128i MSG, TMP, MASK;
|
__m128i MSG, TMP, MASK;
|
||||||
@@ -17,8 +207,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
|||||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||||
|
|
||||||
// Load initial values
|
// Load initial values
|
||||||
TMP = _mm_load_si128((__m128i*) &state[0]);
|
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||||
STATE1 = _mm_load_si128((__m128i*) &state[4]);
|
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||||
|
|
||||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||||
@@ -31,8 +221,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
|||||||
CDGH_SAVE = STATE1;
|
CDGH_SAVE = STATE1;
|
||||||
|
|
||||||
// Rounds 0-3
|
// Rounds 0-3
|
||||||
MSG = _mm_load_si128((const __m128i*) (input+0));
|
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||||
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
|
||||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
@@ -46,7 +236,6 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
|||||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||||
|
|
||||||
// Rounds 8-11
|
// Rounds 8-11
|
||||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||||
@@ -192,9 +381,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
|||||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||||
|
|
||||||
// Save state
|
// Save state
|
||||||
_mm_store_si128((__m128i*) &state[0], STATE0);
|
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||||
_mm_store_si128((__m128i*) &state[4], STATE1);
|
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
142
algo/sha/sha256-hash.c
Normal file
142
algo/sha/sha256-hash.c
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
#include "sha256-hash.h"
|
||||||
|
|
||||||
|
static const uint32_t SHA256_IV[8] =
|
||||||
|
{
|
||||||
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||||
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
static const uint8_t SHA256_PAD[64] =
|
||||||
|
{
|
||||||
|
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
|
void sha256_ctx_init( sha256_context *ctx )
|
||||||
|
{
|
||||||
|
memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV );
|
||||||
|
ctx->count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha256_update( sha256_context *ctx, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
int ptr = ctx->count & 0x3f;
|
||||||
|
const uint8_t *src = data;
|
||||||
|
|
||||||
|
ctx->count += (uint64_t)len;
|
||||||
|
|
||||||
|
if ( len < 64 - ptr )
|
||||||
|
{
|
||||||
|
memcpy( ctx->buf + ptr, src, len );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy( ctx->buf + ptr, src, 64 - ptr );
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||||
|
src += 64 - ptr;
|
||||||
|
len -= 64 - ptr;
|
||||||
|
|
||||||
|
while ( len >= 64 )
|
||||||
|
{
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state );
|
||||||
|
src += 64;
|
||||||
|
len -= 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy( ctx->buf, src, len );
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
void sha256_final( sha256_context *ctx, uint32_t *hash )
|
||||||
|
{
|
||||||
|
size_t r;
|
||||||
|
|
||||||
|
|
||||||
|
/* Figure out how many bytes we have buffered. */
|
||||||
|
r = ctx->count & 0x3f;
|
||||||
|
// r = ( ctx->count >> 3 ) & 0x3f;
|
||||||
|
|
||||||
|
//printf("final: count= %d, r= %d\n", ctx->count, r );
|
||||||
|
|
||||||
|
/* Pad to 56 mod 64, transforming if we finish a block en route. */
|
||||||
|
if ( r < 56 )
|
||||||
|
{
|
||||||
|
/* Pad to 56 mod 64. */
|
||||||
|
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Finish the current block and mix. */
|
||||||
|
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||||
|
|
||||||
|
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||||
|
|
||||||
|
/* The start of the final block is all zeroes. */
|
||||||
|
memset( &ctx->buf[0], 0, 56 );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add the terminating bit-count. */
|
||||||
|
ctx->buf[56] = bswap_64( ctx->count << 3 );
|
||||||
|
// ctx->buf[56] = bswap_64( ctx->count );
|
||||||
|
// be64enc( &ctx->buf[56], ctx->count );
|
||||||
|
|
||||||
|
/* Mix in the final block. */
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||||
|
|
||||||
|
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
|
||||||
|
|
||||||
|
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
|
||||||
|
|
||||||
|
/*
|
||||||
|
// be32enc_vect(digest, ctx->state, 4);
|
||||||
|
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
|
||||||
|
// Encode vector, two words at a time.
|
||||||
|
do {
|
||||||
|
be32enc(&dst[0], src[0]);
|
||||||
|
be32enc(&dst[4], src[1]);
|
||||||
|
src += 2;
|
||||||
|
dst += 8;
|
||||||
|
} while (--len);
|
||||||
|
*/
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void sha256_final( sha256_context *ctx, void *hash )
|
||||||
|
{
|
||||||
|
int ptr = ctx->count & 0x3f;
|
||||||
|
|
||||||
|
ctx->buf[ ptr++ ] = 0x80;
|
||||||
|
|
||||||
|
if ( ptr > 56 )
|
||||||
|
{
|
||||||
|
memset( ctx->buf + ptr, 0, 64 - ptr );
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||||
|
memset( ctx->buf, 0, 56 );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
memset( ctx->buf + ptr, 0, 56 - ptr );
|
||||||
|
|
||||||
|
*(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 );
|
||||||
|
|
||||||
|
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] );
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha256_full( void *hash, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
sha256_context ctx;
|
||||||
|
sha256_ctx_init( &ctx );
|
||||||
|
sha256_update( &ctx, data, len );
|
||||||
|
sha256_final( &ctx, hash );
|
||||||
|
}
|
||||||
|
|
||||||
60
algo/sha/sha256-hash.h
Normal file
60
algo/sha/sha256-hash.h
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#ifndef SHA256_HASH_H__
|
||||||
|
#define SHA256_HASH_H__ 1
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include "simd-utils.h"
|
||||||
|
#include "cpuminer-config.h"
|
||||||
|
#include "sph_sha2.h"
|
||||||
|
|
||||||
|
|
||||||
|
// generic interface
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
unsigned char buf[64]; /* first field, for alignment */
|
||||||
|
uint32_t state[8];
|
||||||
|
uint64_t count;
|
||||||
|
} sha256_context __attribute__((aligned(64)));
|
||||||
|
|
||||||
|
void sha256_full( void *hash, const void *data, size_t len );
|
||||||
|
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||||
|
void sha256_final( sha256_context *ctx, void *hash );
|
||||||
|
void sha256_ctx_init( sha256_context *ctx );
|
||||||
|
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||||
|
const uint32_t *state_in );
|
||||||
|
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||||
|
const uint32_t *state_in );
|
||||||
|
|
||||||
|
#if defined(__SHA__)
|
||||||
|
|
||||||
|
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||||
|
const uint32_t *state_in );
|
||||||
|
|
||||||
|
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||||
|
const uint32_t *state_in );
|
||||||
|
|
||||||
|
// 2 way with interleaved instructions
|
||||||
|
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||||
|
const void *msg_X, const void *msg_Y,
|
||||||
|
const uint32_t *in_X, const uint32_t *in_Y );
|
||||||
|
|
||||||
|
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||||
|
const void *msg_X, const void *msg_Y,
|
||||||
|
const uint32_t *in_X, const uint32_t *in_Y );
|
||||||
|
|
||||||
|
// Select target
|
||||||
|
// with SHA...
|
||||||
|
#define sha256_transform_le sha256_opt_transform_le
|
||||||
|
#define sha256_transform_be sha256_opt_transform_be
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// without SHA...
|
||||||
|
#define sha256_transform_le sph_sha256_transform_le
|
||||||
|
#define sha256_transform_be sph_sha256_transform_be
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SHA can't do only 3 rounds
|
||||||
|
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||||
|
|
||||||
|
#endif
|
||||||
274
algo/sha/sha256d-4way.c
Normal file
274
algo/sha/sha256d-4way.c
Normal file
@@ -0,0 +1,274 @@
|
|||||||
|
#include "sha256d-4way.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "sha-hash-4way.h"
|
||||||
|
|
||||||
|
#if defined(SHA256D_16WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
__m512i block[16] __attribute__ ((aligned (64)));
|
||||||
|
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
const uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 16;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = vdata + 19;
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||||
|
const __m512i sixteen = m512_const1_32( 16 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 19; i++ )
|
||||||
|
vdata[i] = m512_const1_32( pdata[i] );
|
||||||
|
|
||||||
|
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||||
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||||
|
|
||||||
|
// initialize state
|
||||||
|
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||||
|
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||||
|
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||||
|
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||||
|
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||||
|
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||||
|
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||||
|
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||||
|
|
||||||
|
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||||
|
|
||||||
|
// Do 3 rounds on the first 12 bytes of the next block
|
||||||
|
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
// 1. final 16 bytes of data, with padding
|
||||||
|
memcpy_512( block, vdata + 16, 4 );
|
||||||
|
block[ 4] = last_byte;
|
||||||
|
memset_zero_512( block + 5, 10 );
|
||||||
|
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||||
|
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||||
|
|
||||||
|
// 2. 32 byte hash from 1.
|
||||||
|
memcpy_512( block, hash32, 8 );
|
||||||
|
block[ 8] = last_byte;
|
||||||
|
memset_zero_512( block + 9, 6 );
|
||||||
|
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||||
|
sha256_16way_transform_le( hash32, block, initstate );
|
||||||
|
|
||||||
|
// byte swap final hash for testing
|
||||||
|
mm512_block_bswap_32( hash32, hash32 );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
|
{
|
||||||
|
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||||
|
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||||
|
n += 16;
|
||||||
|
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SHA256D_8WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
__m256i block[16] __attribute__ ((aligned (64)));
|
||||||
|
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
const uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m256i *noncev = vdata + 19;
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||||
|
const __m256i eight = m256_const1_32( 8 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 19; i++ )
|
||||||
|
vdata[i] = m256_const1_32( pdata[i] );
|
||||||
|
|
||||||
|
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||||
|
|
||||||
|
// initialize state
|
||||||
|
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||||
|
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||||
|
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||||
|
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||||
|
initstate[4] = m256_const1_64( 0x510E527F510E527F );
|
||||||
|
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||||
|
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||||
|
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||||
|
|
||||||
|
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||||
|
|
||||||
|
// Do 3 rounds on the first 12 bytes of the next block
|
||||||
|
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
// 1. final 16 bytes of data, with padding
|
||||||
|
memcpy_256( block, vdata + 16, 4 );
|
||||||
|
block[ 4] = last_byte;
|
||||||
|
memset_zero_256( block + 5, 10 );
|
||||||
|
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||||
|
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||||
|
|
||||||
|
// 2. 32 byte hash from 1.
|
||||||
|
memcpy_256( block, hash32, 8 );
|
||||||
|
block[ 8] = last_byte;
|
||||||
|
memset_zero_256( block + 9, 6 );
|
||||||
|
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||||
|
sha256_8way_transform_le( hash32, block, initstate );
|
||||||
|
|
||||||
|
// byte swap final hash for testing
|
||||||
|
mm256_block_bswap_32( hash32, hash32 );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
|
{
|
||||||
|
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||||
|
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||||
|
n += 8;
|
||||||
|
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SHA256D_4WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
__m128i block[16] __attribute__ ((aligned (64)));
|
||||||
|
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||||
|
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||||
|
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
|
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
const uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 4;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m128i *noncev = vdata + 19;
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||||
|
const __m128i four = m128_const1_32( 4 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 19; i++ )
|
||||||
|
vdata[i] = m128_const1_32( pdata[i] );
|
||||||
|
|
||||||
|
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||||
|
|
||||||
|
// initialize state
|
||||||
|
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||||
|
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||||
|
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||||
|
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||||
|
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||||
|
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||||
|
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||||
|
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||||
|
|
||||||
|
// hash first 64 bytes of data
|
||||||
|
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
// 1. final 16 bytes of data, with padding
|
||||||
|
memcpy_128( block, vdata + 16, 4 );
|
||||||
|
block[ 4] = last_byte;
|
||||||
|
memset_zero_128( block + 5, 10 );
|
||||||
|
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||||
|
sha256_4way_transform_le( hash32, block, midstate );
|
||||||
|
|
||||||
|
// 2. 32 byte hash from 1.
|
||||||
|
memcpy_128( block, hash32, 8 );
|
||||||
|
block[ 8] = last_byte;
|
||||||
|
memset_zero_128( block + 9, 6 );
|
||||||
|
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||||
|
sha256_4way_transform_le( hash32, block, initstate );
|
||||||
|
|
||||||
|
// byte swap final hash for testing
|
||||||
|
mm128_block_bswap_32( hash32, hash32 );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 4; lane++ )
|
||||||
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
|
{
|
||||||
|
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||||
|
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = _mm_add_epi32( *noncev, four );
|
||||||
|
n += 4;
|
||||||
|
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
bool register_sha256d_algo( algo_gate_t* gate )
|
||||||
|
{
|
||||||
|
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
|
#if defined(SHA256D_16WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||||
|
#elif defined(SHA256D_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||||
|
#elif defined(SHA256D_4WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// gate->hash = (void*)&sha256d;
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
48
algo/sha/sha256d-4way.h
Normal file
48
algo/sha/sha256d-4way.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
#ifndef __SHA256D_4WAY_H__
|
||||||
|
#define __SHA256D_4WAY_H__ 1
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "algo-gate-api.h"
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
#define SHA256D_16WAY 1
|
||||||
|
/*
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
#define SHA256D_8WAY 1
|
||||||
|
#else
|
||||||
|
#define SHA256D_4WAY 1
|
||||||
|
*/
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bool register_sha256d_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
|
#if defined(SHA256D_16WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
#if defined(SHA256D_8WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SHA256D_4WAY)
|
||||||
|
|
||||||
|
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
#if defined(__SHA__)
|
||||||
|
|
||||||
|
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
8
algo/sha/sha256d.c
Normal file
8
algo/sha/sha256d.c
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
#include "sha256d.h"
|
||||||
|
|
||||||
|
void sha256d( void *hash, const void *data, int len )
|
||||||
|
{
|
||||||
|
sha256_full( hash, data, len );
|
||||||
|
sha256_full( hash, hash, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
7
algo/sha/sha256d.h
Normal file
7
algo/sha/sha256d.h
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
#include "algo-gate-api.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
#include "sha256-hash.h"
|
||||||
|
|
||||||
|
void sha256d( void *hash, const void *data, int len );
|
||||||
|
|
||||||
@@ -3,14 +3,14 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "algo/sha/sph_sha2.h"
|
#include "algo/sha/sha256-hash.h"
|
||||||
|
|
||||||
static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
void sha256q_midstate( const void* input )
|
void sha256q_midstate( const void* input )
|
||||||
{
|
{
|
||||||
sph_sha256_init( &sha256q_ctx );
|
sha256_ctx_init( &sha256q_ctx );
|
||||||
sph_sha256( &sha256q_ctx, input, 64 );
|
sha256_update( &sha256q_ctx, input, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
int sha256q_hash( void* output, const void* input )
|
int sha256q_hash( void* output, const void* input )
|
||||||
@@ -19,23 +19,15 @@ int sha256q_hash( void* output, const void* input )
|
|||||||
const int midlen = 64; // bytes
|
const int midlen = 64; // bytes
|
||||||
const int tail = 80 - midlen; // 16
|
const int tail = 80 - midlen; // 16
|
||||||
|
|
||||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
sha256_context ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
|
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
|
||||||
|
|
||||||
sph_sha256( &ctx, input + midlen, tail );
|
sha256_update( &ctx, input + midlen, tail );
|
||||||
sph_sha256_close( &ctx, hash );
|
sha256_final( &ctx, hash );
|
||||||
|
|
||||||
sph_sha256_init( &ctx );
|
sha256_full( hash, hash, 32 );
|
||||||
sph_sha256( &ctx, hash, 32 );
|
sha256_full( hash, hash, 32 );
|
||||||
sph_sha256_close( &ctx, hash );
|
sha256_full( output, hash, 32 );
|
||||||
|
|
||||||
sph_sha256_init( &ctx );
|
|
||||||
sph_sha256( &ctx, hash, 32 );
|
|
||||||
sph_sha256_close( &ctx, hash );
|
|
||||||
|
|
||||||
sph_sha256_init( &ctx );
|
|
||||||
sph_sha256( &ctx, hash, 32 );
|
|
||||||
sph_sha256_close( &ctx, output );
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,64 +7,84 @@
|
|||||||
|
|
||||||
#if defined(SHA256T_16WAY)
|
#if defined(SHA256T_16WAY)
|
||||||
|
|
||||||
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
void sha256t_16way_hash( void* output, const void* input )
|
|
||||||
{
|
|
||||||
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
|
|
||||||
sha256_16way_context ctx;
|
|
||||||
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
|
|
||||||
|
|
||||||
sha256_16way_update( &ctx, input + (64<<4), 16 );
|
|
||||||
sha256_16way_close( &ctx, vhash );
|
|
||||||
|
|
||||||
sha256_16way_init( &ctx );
|
|
||||||
sha256_16way_update( &ctx, vhash, 32 );
|
|
||||||
sha256_16way_close( &ctx, vhash );
|
|
||||||
|
|
||||||
sha256_16way_init( &ctx );
|
|
||||||
sha256_16way_update( &ctx, vhash, 32 );
|
|
||||||
sha256_16way_close( &ctx, output );
|
|
||||||
}
|
|
||||||
|
|
||||||
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
__m512i block[16] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash32[8*16] __attribute__ ((aligned (32)));
|
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||||
|
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t *hash32_d7 = &(hash32[7<<4]);
|
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
const uint32_t *ptarget = work->target;
|
const uint32_t *ptarget = work->target;
|
||||||
const uint32_t targ32_d7 = ptarget[7];
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
const uint32_t last_nonce = max_nonce - 16;
|
const uint32_t last_nonce = max_nonce - 16;
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
__m512i *noncev = vdata + 19;
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
|
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||||
|
const __m512i sixteen = m512_const1_32( 16 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 19; i++ )
|
||||||
|
vdata[i] = m512_const1_32( pdata[i] );
|
||||||
|
|
||||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
|
||||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||||
sha256_16way_init( &sha256_ctx16 );
|
|
||||||
sha256_16way_update( &sha256_ctx16, vdata, 64 );
|
// initialize state
|
||||||
|
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||||
|
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||||
|
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||||
|
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||||
|
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||||
|
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||||
|
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||||
|
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||||
|
|
||||||
|
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||||
|
|
||||||
|
// Do 3 rounds on the first 12 bytes of the next block
|
||||||
|
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
pdata[19] = n;
|
// 1. final 16 bytes of data, with padding
|
||||||
sha256t_16way_hash( hash32, vdata );
|
memcpy_512( block, vdata + 16, 4 );
|
||||||
|
block[ 4] = last_byte;
|
||||||
|
memset_zero_512( block + 5, 10 );
|
||||||
|
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||||
|
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||||
|
|
||||||
|
// 2. 32 byte hash from 1.
|
||||||
|
memcpy_512( block, hash32, 8 );
|
||||||
|
block[ 8] = last_byte;
|
||||||
|
memset_zero_512( block + 9, 6 );
|
||||||
|
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||||
|
sha256_16way_transform_le( hash32, block, initstate );
|
||||||
|
|
||||||
|
// 3. 32 byte hash from 2.
|
||||||
|
memcpy_512( block, hash32, 8 );
|
||||||
|
sha256_16way_transform_le( hash32, block, initstate );
|
||||||
|
|
||||||
|
// byte swap final hash for testing
|
||||||
|
mm512_block_bswap_32( hash32, hash32 );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 16; lane++ )
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
{
|
{
|
||||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = bswap_32( n + lane );
|
pdata[19] = n + lane;
|
||||||
submit_solution( work, lane_hash, mythr );
|
submit_solution( work, lane_hash, mythr );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||||
n += 16;
|
n += 16;
|
||||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
@@ -72,67 +92,88 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(SHA256T_8WAY)
|
#if defined(SHA256T_8WAY)
|
||||||
|
|
||||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
void sha256t_8way_hash( void* output, const void* input )
|
|
||||||
{
|
|
||||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
|
||||||
sha256_8way_context ctx;
|
|
||||||
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
|
|
||||||
|
|
||||||
sha256_8way_update( &ctx, input + (64<<3), 16 );
|
|
||||||
sha256_8way_close( &ctx, vhash );
|
|
||||||
|
|
||||||
sha256_8way_init( &ctx );
|
|
||||||
sha256_8way_update( &ctx, vhash, 32 );
|
|
||||||
sha256_8way_close( &ctx, vhash );
|
|
||||||
|
|
||||||
sha256_8way_init( &ctx );
|
|
||||||
sha256_8way_update( &ctx, vhash, 32 );
|
|
||||||
sha256_8way_close( &ctx, output );
|
|
||||||
}
|
|
||||||
|
|
||||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
__m256i block[16] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||||
|
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t *hash32_d7 = &(hash32[7<<3]);
|
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
const uint32_t *ptarget = work->target;
|
const uint32_t *ptarget = work->target;
|
||||||
const uint32_t targ32_d7 = ptarget[7];
|
const uint32_t targ32_d7 = ptarget[7];
|
||||||
 const uint32_t first_nonce = pdata[19];
 const uint32_t last_nonce = max_nonce - 8;
 uint32_t n = first_nonce;
-__m256i *noncev = (__m256i*)vdata + 19;   // aligned
+__m256i *noncev = vdata + 19;
 const int thr_id = mythr->id;
 const bool bench = opt_benchmark;
+const __m256i last_byte = m256_const1_32( 0x80000000 );
+const __m256i eight = m256_const1_32( 8 );

-mm256_bswap32_intrlv80_8x32( vdata, pdata );
-*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-sha256_8way_init( &sha256_ctx8 );
-sha256_8way_update( &sha256_ctx8, vdata, 64 );
+for ( int i = 0; i < 19; i++ )
+   vdata[i] = m256_const1_32( pdata[i] );
+
+*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
+
+// initialize state
+initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
+initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
+initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
+initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
+initstate[4] = m256_const1_64( 0x510E527F510E527F );
+initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
+initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
+initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
+
+sha256_8way_transform_le( midstate1, vdata, initstate );
+
+// Do 3 rounds on the first 12 bytes of the next block
+sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );

 do
 {
-   pdata[19] = n;
-   sha256t_8way_hash( hash32, vdata );
+   // 1. final 16 bytes of data, with padding
+   memcpy_256( block, vdata + 16, 4 );
+   block[ 4] = last_byte;
+   memset_zero_256( block + 5, 10 );
+   block[15] = m256_const1_32( 80*8 ); // bit count
+   sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
+
+   // 2. 32 byte hash from 1.
+   memcpy_256( block, hash32, 8 );
+   block[ 8] = last_byte;
+   memset_zero_256( block + 9, 6 );
+   block[15] = m256_const1_32( 32*8 ); // bit count
+   sha256_8way_transform_le( hash32, block, initstate );
+
+   // 3. 32 byte hash from 2.
+   memcpy_256( block, hash32, 8 );
+   sha256_8way_transform_le( hash32, block, initstate );
+
+   // byte swap final hash for testing
+   mm256_block_bswap_32( hash32, hash32 );

    for ( int lane = 0; lane < 8; lane++ )
    if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
    {
       extr_lane_8x32( lane_hash, hash32, lane, 256 );
       if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
       {
-         pdata[19] = bswap_32( n + lane );
+         pdata[19] = n + lane;
          submit_solution( work, lane_hash, mythr );
       }
    }
-   *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+   *noncev = _mm256_add_epi32( *noncev, eight );
    n += 8;
 } while ( (n < last_nonce) && !work_restart[thr_id].restart );
 pdata[19] = n;
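For reference, the loop above is the entire sha256t (triple SHA-256) computation inlined: one SHA-256 over the 80-byte header followed by two more SHA-256 passes over the 32-byte result, with the first 64 header bytes folded into a reusable midstate because they never change per nonce. A minimal single-lane sketch of the same flow, assuming a hypothetical one-block compression helper sha256_block_le() (state = compress(state, block)); this is an illustration only, not a cpuminer-opt function:

#include <stdint.h>
#include <string.h>

// assumed helper: compress one 64 byte block (16 LE words) into state
void sha256_block_le( uint32_t state[8], const uint32_t block[16] );

static const uint32_t H0[8] = {
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };

void sha256t_sketch( uint32_t hash[8], const uint32_t data[20] )  // 80 byte header
{
   uint32_t mid[8], st[8], block[16];

   // first 64 bytes are nonce independent: hash them once, reuse per nonce
   memcpy( mid, H0, sizeof mid );
   sha256_block_le( mid, data );

   // 1. last 16 bytes of the header + padding + 640 bit length
   memcpy( st, mid, sizeof st );
   memcpy( block, data + 16, 16 );
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 10*4 );
   block[15] = 80*8;
   sha256_block_le( st, block );

   // 2. hash the 32 byte result: padding + 256 bit length
   memcpy( block, st, 32 );
   block[ 8] = 0x80000000;
   memset( block + 9, 0, 6*4 );
   block[15] = 32*8;
   memcpy( st, H0, sizeof st );
   sha256_block_le( st, block );

   // 3. hash the result once more (padding from step 2 still applies)
   memcpy( block, st, 32 );
   memcpy( hash, H0, 32 );
   sha256_block_le( hash, block );
}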
@@ -144,82 +185,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
 #if defined(SHA256T_4WAY)
 
-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
-
-void sha256t_4way_hash( void* output, const void* input )
-{
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   sha256_4way_context ctx;
-   memcpy( &ctx, &sha256_ctx4, sizeof ctx );
-
-   sha256_4way_update( &ctx, input + (64<<2), 16 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, output );
-}
 
 int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   __m128i block[16] __attribute__ ((aligned (64)));
+   __m128i hash32[8] __attribute__ ((aligned (32)));
+   __m128i initstate[8] __attribute__ ((aligned (32)));
+   __m128i midstate[8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   __m128i vdata[20] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32_d7 = ptarget[7];
    const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
    uint32_t n = first_nonce;
-   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
+   __m128i *noncev = vdata + 19;
    const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );
 
-   const uint64_t htmax[] = {          0,
-                                     0xF,
-                                    0xFF,
-                                   0xFFF,
-                                  0xFFFF,
-                              0x10000000 };
-   const uint32_t masks[] = { 0xFFFFFFFF,
-                              0xFFFFFFF0,
-                              0xFFFFFF00,
-                              0xFFFFF000,
-                              0xFFFF0000,
-                                       0 };
-
-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   sha256_4way_init( &sha256_ctx4 );
-   sha256_4way_update( &sha256_ctx4, vdata, 64 );
-
-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   for ( int i = 0; i < 19; i++ )
+      vdata[i] = m128_const1_32( pdata[i] );
+
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
+
+   // initialize state
+   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
+   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
+   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
+   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
+   initstate[4] = m128_const1_64( 0x510E527F510E527F );
+   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
+   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+
+   // hash first 64 bytes of data
+   sha256_4way_transform_le( midstate, vdata, initstate );
+
+   do
    {
-      const uint32_t mask = masks[m];
-      do {
-         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
-         pdata[19] = n;
-
-         sha256t_4way_hash( hash, vdata );
+      // 1. final 16 bytes of data, with padding
+      memcpy_128( block, vdata + 16, 4 );
+      block[ 4] = last_byte;
+      memset_zero_128( block + 5, 10 );
+      block[15] = m128_const1_32( 80*8 ); // bit count
+      sha256_4way_transform_le( hash32, block, midstate );
+
+      // 2. 32 byte hash from 1.
+      memcpy_128( block, hash32, 8 );
+      block[ 8] = last_byte;
+      memset_zero_128( block + 9, 6 );
+      block[15] = m128_const1_32( 32*8 ); // bit count
+      sha256_4way_transform_le( hash32, block, initstate );
+
+      // 3. 32 byte hash from 2.
+      memcpy_128( block, hash32, 8 );
+      sha256_4way_transform_le( hash32, block, initstate );
+
+      // byte swap final hash for testing
+      mm128_block_bswap_32( hash32, hash32 );
 
       for ( int lane = 0; lane < 4; lane++ )
-      if ( !( hash7[ lane ] & mask ) )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
       {
-         extr_lane_4x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
          {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
+      *noncev = _mm_add_epi32( *noncev, four );
      n += 4;
-      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
-      break;
-   }
-   *hashes_done = n - first_nonce + 1;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
    return 0;
 }
 
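The lane test above drops the old Htarg/htmax mask tables: each lane is first screened by comparing only the most significant 32-bit word of its byte-swapped hash against the same word of the target, and only survivors go through the full 256-bit comparison. A hedged scalar illustration of that filter; valid_hash() here stands in for the miner's full target test:

#include <stdint.h>
#include <stdbool.h>

bool valid_hash( const uint32_t hash[8], const uint32_t target[8] );   // assumed

static inline bool maybe_solution( const uint32_t hash[8],
                                   const uint32_t target[8] )
{
   // word 7 holds the most significant bits of the 256 bit value,
   // so hash[7] > target[7] can never be at or below the target
   if ( hash[7] > target[7] ) return false;
   return valid_hash( hash, target );
}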
@@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
 #if defined(SHA256T_16WAY)
   gate->scanhash = (void*)&scanhash_sha256t_16way;
-  gate->hash = (void*)&sha256t_16way_hash;
 #elif defined(__SHA__)
   gate->optimizations = SHA_OPT;
   gate->scanhash = (void*)&scanhash_sha256t;
-  gate->hash = (void*)&sha256t_hash;
 #elif defined(SHA256T_8WAY)
   gate->scanhash = (void*)&scanhash_sha256t_8way;
-  gate->hash = (void*)&sha256t_8way_hash;
 #else
   gate->scanhash = (void*)&scanhash_sha256t_4way;
-  gate->hash = (void*)&sha256t_4way_hash;
 #endif
   return true;
 }
@@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate );
 
 #if defined(SHA256T_16WAY)
 
-void sha256t_16way_hash( void *output, const void *input );
 int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 void sha256q_16way_hash( void *output, const void *input );
@@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce,
 
 #if defined(SHA256T_8WAY)
 
-void sha256t_8way_hash( void *output, const void *input );
 int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 void sha256q_8way_hash( void *output, const void *input );
@@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
 
 #if defined(SHA256T_4WAY)
 
-void sha256t_4way_hash( void *output, const void *input );
 int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 void sha256q_4way_hash( void *output, const void *input );
@@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 
+#if defined(__SHA__)
 
-int sha256t_hash( void *output, const void *input );
 int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 
+#endif
 
 int sha256q_hash( void *output, const void *input );
 int scanhash_sha256q( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
@@ -3,46 +3,23 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/sha/sph_sha2.h"
+//#include "algo/sha/sph_sha2.h"
+#include "sha256-hash.h"
+
+#if defined(__SHA__)
 
 // Only used on CPUs with SHA
 
-static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
-
-void sha256t_midstate( const void* input )
-{
-   sph_sha256_init( &sha256t_ctx );
-   sph_sha256( &sha256t_ctx, input, 64 );
-}
-
-int sha256t_hash( void* output, const void* input )
-{
-   uint32_t _ALIGN(64) hash[16];
-   const int midlen = 64;            // bytes
-   const int tail   = 80 - midlen;   // 16
-
-   sph_sha256_context ctx __attribute__ ((aligned (64)));
-   memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
-
-   sph_sha256( &ctx, input + midlen, tail );
-   sph_sha256_close( &ctx, hash );
-
-   sph_sha256_init( &ctx );
-   sph_sha256( &ctx, hash, 32 );
-   sph_sha256_close( &ctx, hash );
-
-   sph_sha256_init( &ctx );
-   sph_sha256( &ctx, hash, 32 );
-   sph_sha256_close( &ctx, output );
-
-   return 1;
-}
-
 int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t edata[20] __attribute__((aligned(64)));
-   uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t block0[16] __attribute__ ((aligned (64)));
+   uint32_t block1[16] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t initstate[8] __attribute__ ((aligned (32)));
+   uint32_t midstate[8] __attribute__ ((aligned (32)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
@@ -50,24 +27,76 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
    uint32_t n = first_nonce;
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
+   __m128i shuf_bswap32 =
+           _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
 
-   mm128_bswap32_80( edata, pdata );
-   sha256t_midstate( edata );
+   // initialize state
+   initstate[0] = 0x6A09E667;
+   initstate[1] = 0xBB67AE85;
+   initstate[2] = 0x3C6EF372;
+   initstate[3] = 0xA54FF53A;
+   initstate[4] = 0x510E527F;
+   initstate[5] = 0x9B05688C;
+   initstate[6] = 0x1F83D9AB;
+   initstate[7] = 0x5BE0CD19;
+
+   // hash first 64 bytes of data
+   sha256_opt_transform_le( midstate, pdata, initstate );
 
    do
    {
-      edata[19] = n;
-      if ( likely( sha256t_hash( hash, edata ) ) )
-      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
+      // 1. final 16 bytes of data, with padding
+      memcpy( block0, pdata + 16, 16 );
+      memcpy( block1, pdata + 16, 16 );
+      block0[ 3] = n;
+      block1[ 3] = n+1;
+      block0[ 4] = block1[ 4] = 0x80000000;
+      memset( block0 + 5, 0, 40 );
+      memset( block1 + 5, 0, 40 );
+      block0[15] = block1[15] = 80*8; // bit count
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
+
+      // 2. 32 byte hash from 1.
+      memcpy( block0, hash0, 32 );
+      memcpy( block1, hash1, 32 );
+      block0[ 8] = block1[ 8] = 0x80000000;
+      memset( block0 + 9, 0, 24 );
+      memset( block1 + 9, 0, 24 );
+      block0[15] = block1[15] = 32*8; // bit count
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
+
+      // 3. 32 byte hash from 2.
+      memcpy( block0, hash0, 32 );
+      memcpy( block1, hash1, 32 );
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
+
+      // byte swap final hash for testing
+      casti_m128i( hash0, 0 ) =
+                     _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
+      casti_m128i( hash0, 1 ) =
+                     _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
+      casti_m128i( hash1, 0 ) =
+                     _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
+      casti_m128i( hash1, 1 ) =
+                     _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
+
+      if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
       {
-         pdata[19] = bswap_32( n );
-         submit_solution( work, hash, mythr );
+         pdata[19] = n;
+         submit_solution( work, hash0, mythr );
       }
-      n++;
-   } while ( n < last_nonce && !work_restart[thr_id].restart );
-   *hashes_done = n - first_nonce;
+      if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
+      {
+         pdata[19] = n+1;
+         submit_solution( work, hash1, mythr );
+      }
+      n += 2;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+
    pdata[19] = n;
+   *hashes_done = n - first_nonce;
    return 0;
 }
+
+#endif
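A note on the byte swap at the end of that loop: _mm_shuffle_epi8 with the shuf_bswap32 control reverses the bytes inside each 32-bit word of a 128-bit register, which puts the hash words into the byte order the target test expects. A small standalone check of that mask (SSSE3 required; the test values are my own, not project code):

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

int main(void)
{
   const __m128i shuf_bswap32 =
       _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
   uint32_t in[4]  = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
   uint32_t out[4];

   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, shuf_bswap32 ) );

   for ( int i = 0; i < 4; i++ )
      printf( "%08x -> %08x\n", in[i], out[i] );   // expect 44332211, ...
   return 0;
}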
@@ -95,83 +95,35 @@ static const uint64_t K512[80] =
 
 // SHA-512 8 way 64 bit
 
-#define CH8W(X, Y, Z) \
-   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
+#define CH8W( X, Y, Z )    _mm512_ternarylogic_epi64( X, Y, Z, 0xca )
 
-#define MAJ8W(X, Y, Z) \
-   _mm512_or_si512( _mm512_and_si512( X, Y ), \
-                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+#define MAJ8W( X, Y, Z )   _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
 
-#define BSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
+#define BSG8W_5_0( x )     mm512_xor3( _mm512_ror_epi64( x, 28 ), \
+                                       _mm512_ror_epi64( x, 34 ), \
+                                       _mm512_ror_epi64( x, 39 ) )
 
-#define BSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
+#define BSG8W_5_1( x )     mm512_xor3( _mm512_ror_epi64( x, 14 ), \
+                                       _mm512_ror_epi64( x, 18 ), \
+                                       _mm512_ror_epi64( x, 41 ) )
 
-#define SSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
+#define SSG8W_5_0( x )     mm512_xor3( _mm512_ror_epi64( x,  1 ), \
+                                       _mm512_ror_epi64( x,  8 ), \
+                                       _mm512_srli_epi64( x, 7 ) )
 
-#define SSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
+#define SSG8W_5_1( x )     mm512_xor3( _mm512_ror_epi64( x, 19 ), \
+                                       _mm512_ror_epi64( x, 61 ), \
+                                       _mm512_srli_epi64( x, 6 ) )
 
-static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
-{
-   __m512i w0a, w1a, w0b, w1b;
-   w0a = mm512_ror_64( w0, 1 );
-   w1a = mm512_ror_64( w1,19 );
-   w0b = mm512_ror_64( w0, 8 );
-   w1b = mm512_ror_64( w1,61 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   w0b = _mm512_srli_epi64( w0, 7 );
-   w1b = _mm512_srli_epi64( w1, 6 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   return _mm512_add_epi64( w0a, w1a );
-}
-
-#define SSG8W_512x2_0( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-15], 1 ); \
-   X1a = mm512_ror_64( W[i-14], 1 ); \
-   X0b = mm512_ror_64( W[i-15], 8 ); \
-   X1b = mm512_ror_64( W[i-14], 8 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-15], 7 ); \
-   X1b = _mm512_srli_epi64( W[i-14], 7 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SSG8W_512x2_1( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-2],19 ); \
-   X1a = mm512_ror_64( W[i-1],19 ); \
-   X0b = mm512_ror_64( W[i-2],61 ); \
-   X1b = mm512_ror_64( W[i-1],61 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-2], 6 ); \
-   X1b = _mm512_srli_epi64( W[i-1], 6 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
+#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
 do { \
-   __m512i T1, T2; \
-   __m512i K = _mm512_set1_epi64( K512[ i ] ); \
-   T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
-                                            K, W[i] ) ); \
-   T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
+   __m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
+   __m512i T1 = BSG8W_5_1( E ); \
+   __m512i T2 = BSG8W_5_0( A ); \
+   T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
+   T1 = _mm512_add_epi64( T1, H ); \
+   T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \
+   T1 = _mm512_add_epi64( T1, T0 ); \
    D = _mm512_add_epi64( D, T1 ); \
    H = _mm512_add_epi64( T1, T2 ); \
 } while (0)
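CH8W and MAJ8W now compile to a single vpternlogq each: the 8-bit immediate is the truth table of the three-input function, indexed by (X<<2 | Y<<1 | Z). A small standalone check that 0xca really is the choose function and 0xe8 really is majority (my own throwaway test, not project code):

#include <stdio.h>

int main(void)
{
   for ( int x = 0; x < 2; x++ )
   for ( int y = 0; y < 2; y++ )
   for ( int z = 0; z < 2; z++ )
   {
      int idx = (x << 2) | (y << 1) | z;
      int ch  = (0xca >> idx) & 1;          // expected: x ? y : z
      int maj = (0xe8 >> idx) & 1;          // expected: majority(x,y,z)
      printf( "x=%d y=%d z=%d  ch=%d (%d)  maj=%d (%d)\n",
              x, y, z, ch, x ? y : z, maj, (x & y) | (x & z) | (y & z) );
   }
   return 0;
}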
@@ -187,8 +139,8 @@ sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
    mm512_block_bswap_64( W+8, in+8 );
 
    for ( i = 16; i < 80; i++ )
-      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
-                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+      W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ),
+                            W[ i- 7 ], W[ i-16 ] );
 
    if ( ctx->initialized )
    {
@@ -319,13 +271,12 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
|||||||
|
|
||||||
// SHA-512 4 way 64 bit
|
// SHA-512 4 way 64 bit
|
||||||
|
|
||||||
/*
|
|
||||||
#define CH(X, Y, Z) \
|
#define CH(X, Y, Z) \
|
||||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||||
|
|
||||||
#define MAJ(X, Y, Z) \
|
#define MAJ(X, Y, Z) \
|
||||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
Y_xor_Z ) )
|
||||||
|
|
||||||
#define BSG5_0(x) \
|
#define BSG5_0(x) \
|
||||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||||
@@ -334,16 +285,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
|||||||
#define BSG5_1(x) \
|
#define BSG5_1(x) \
|
||||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||||
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
||||||
*/
|
|
||||||
/*
|
|
||||||
#define BSG5_0(x) \
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( \
|
|
||||||
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
|
|
||||||
|
|
||||||
#define BSG5_1(x) \
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( \
|
|
||||||
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
|
|
||||||
*/
|
|
||||||
/*
|
/*
|
||||||
#define SSG5_0(x) \
|
#define SSG5_0(x) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( \
|
_mm256_xor_si256( _mm256_xor_si256( \
|
||||||
@@ -371,98 +313,25 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
|
|||||||
return _mm256_add_epi64( w0a, w1a );
|
return _mm256_add_epi64( w0a, w1a );
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||||
#define SSG512x2_0( w0, w1, i ) do \
|
|
||||||
{ \
|
|
||||||
__m256i X0a, X1a, X0b, X1b; \
|
|
||||||
X0a = mm256_ror_64( W[i-15], 1 ); \
|
|
||||||
X1a = mm256_ror_64( W[i-14], 1 ); \
|
|
||||||
X0b = mm256_ror_64( W[i-15], 8 ); \
|
|
||||||
X1b = mm256_ror_64( W[i-14], 8 ); \
|
|
||||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
|
||||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
|
||||||
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
|
|
||||||
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
|
|
||||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
|
||||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
#define SSG512x2_1( w0, w1, i ) do \
|
|
||||||
{ \
|
|
||||||
__m256i X0a, X1a, X0b, X1b; \
|
|
||||||
X0a = mm256_ror_64( W[i-2],19 ); \
|
|
||||||
X1a = mm256_ror_64( W[i-1],19 ); \
|
|
||||||
X0b = mm256_ror_64( W[i-2],61 ); \
|
|
||||||
X1b = mm256_ror_64( W[i-1],61 ); \
|
|
||||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
|
||||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
|
||||||
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
|
|
||||||
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
|
|
||||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
|
||||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
|
||||||
} while(0)
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
|
||||||
do { \
|
do { \
|
||||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
|
||||||
__m256i T1 = mm256_ror_64( E, 23 ); \
|
__m256i T1 = BSG5_1( E ); \
|
||||||
__m256i T2 = mm256_ror_64( A, 5 ); \
|
__m256i T2 = BSG5_0( A ); \
|
||||||
__m256i T3 = _mm256_xor_si256( F, G ); \
|
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||||
__m256i T4 = _mm256_or_si256( A, B ); \
|
T1 = _mm256_add_epi64( T1, H ); \
|
||||||
__m256i T5 = _mm256_and_si256( A, B ); \
|
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
|
||||||
K = _mm256_add_epi64( K, W[i] ); \
|
T1 = _mm256_add_epi64( T1, T0 ); \
|
||||||
T1 = _mm256_xor_si256( T1, E ); \
|
Y_xor_Z = X_xor_Y; \
|
||||||
T2 = _mm256_xor_si256( T2, A ); \
|
|
||||||
T3 = _mm256_and_si256( T3, E ); \
|
|
||||||
T4 = _mm256_and_si256( T4, C ); \
|
|
||||||
K = _mm256_add_epi64( H, K ); \
|
|
||||||
T1 = mm256_ror_64( T1, 4 ); \
|
|
||||||
T2 = mm256_ror_64( T2, 6 ); \
|
|
||||||
T3 = _mm256_xor_si256( T3, G ); \
|
|
||||||
T4 = _mm256_or_si256( T4, T5 ); \
|
|
||||||
T1 = _mm256_xor_si256( T1, E ); \
|
|
||||||
T2 = _mm256_xor_si256( T2, A ); \
|
|
||||||
T1 = mm256_ror_64( T1, 14 ); \
|
|
||||||
T2 = mm256_ror_64( T2, 28 ); \
|
|
||||||
T1 = _mm256_add_epi64( T1, T3 ); \
|
|
||||||
T2 = _mm256_add_epi64( T2, T4 ); \
|
|
||||||
T1 = _mm256_add_epi64( T1, K ); \
|
|
||||||
H = _mm256_add_epi64( T1, T2 ); \
|
|
||||||
D = _mm256_add_epi64( D, T1 ); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
/*
|
|
||||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
|
||||||
do { \
|
|
||||||
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
|
|
||||||
__m256i T1 = BSG5_1(E); \
|
|
||||||
__m256i T2 = BSG5_0(A); \
|
|
||||||
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
|
|
||||||
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
|
|
||||||
D = _mm256_add_epi64( D, T1 ); \
|
D = _mm256_add_epi64( D, T1 ); \
|
||||||
H = _mm256_add_epi64( T1, T2 ); \
|
H = _mm256_add_epi64( T1, T2 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
|
||||||
do { \
|
|
||||||
__m256i T1, T2; \
|
|
||||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
|
||||||
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
|
|
||||||
K, W[i] ) ); \
|
|
||||||
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
|
|
||||||
D = _mm256_add_epi64( D, T1 ); \
|
|
||||||
H = _mm256_add_epi64( T1, T2 ); \
|
|
||||||
} while (0)
|
|
||||||
*/
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
register __m256i A, B, C, D, E, F, G, H;
|
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||||
__m256i W[80];
|
__m256i W[80];
|
||||||
|
|
||||||
mm256_block_bswap_64( W , in );
|
mm256_block_bswap_64( W , in );
|
||||||
@@ -495,6 +364,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
|||||||
H = m256_const1_64( 0x5BE0CD19137E2179 );
|
H = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||||
|
|
||||||
for ( i = 0; i < 80; i += 8 )
|
for ( i = 0; i < 80; i += 8 )
|
||||||
{
|
{
|
||||||
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
|
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
|
||||||
@@ -40,8 +40,8 @@
 #endif
 
 #define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+//#define MAJ(X, Y, Z)  (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
 #define ROTR SPH_ROTR32
 
 #define BSG2_0(x)   (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
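The commented-out MAJ and its replacement are the same boolean function: majority can be written as Y ^ ((X^Y) & (Y^Z)). The payoff is that consecutive SHA-2 rounds rotate the working variables, so the (Y^Z) needed in one round is exactly the (X^Y) computed in the previous round; that is why the later hunks in this file thread Y_xor_Z = X_xor_Y through every step, saving one XOR per round. An exhaustive check of the identity (standalone, my own test harness):

#include <stdio.h>

// classic:    (y & z) | ((y | z) & x)
// rewritten:   y ^ ((x ^ y) & (y ^ z))
int main(void)
{
   for ( int x = 0; x < 2; x++ )
   for ( int y = 0; y < 2; y++ )
   for ( int z = 0; z < 2; z++ )
   {
      int classic   = (y & z) | ((y | z) & x);
      int rewritten = y ^ ((x ^ y) & (y ^ z));
      if ( classic != rewritten )
         printf( "mismatch at %d%d%d\n", x, y, z );
   }
   printf( "done\n" );   // no mismatches expected
   return 0;
}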
@@ -71,12 +71,8 @@ static const sph_u32 H256[8] = {
  * of the compression function.
  */
 
-#if defined(__SHA__)
-
-#include "sha256-hash-opt.c"
-
-#else // no SHA
-
+/*
 static const sph_u32 K[64] = {
    SPH_C32(0x428A2F98), SPH_C32(0x71374491),
    SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
@@ -111,6 +107,7 @@ static const sph_u32 K[64] = {
    SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
    SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
 };
+*/
 
 #if SPH_SMALL_FOOTPRINT_SHA2
 
@@ -130,6 +127,7 @@ static const sph_u32 K[64] = {
       t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
          + K[pcount + (pc)] + W[(pc) & 0x0F]); \
       t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
+      Y_xor_Z = X_xor_Y; \
       d = SPH_T32(d + t1); \
       h = SPH_T32(t1 + t2); \
    } while (0)
@@ -140,7 +138,7 @@ static const sph_u32 K[64] = {
    SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
 
 #define SHA2_ROUND_BODY(in, r)   do { \
-      sph_u32 A, B, C, D, E, F, G, H; \
+      sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
       sph_u32 W[16]; \
       unsigned pcount; \
       \
@@ -153,6 +151,7 @@ static const sph_u32 K[64] = {
       G = (r)[6]; \
       H = (r)[7]; \
       pcount = 0; \
+      Y_xor_Z = B ^ C; \
       SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
       SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
       SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
@@ -200,7 +199,7 @@ static const sph_u32 K[64] = {
 #else // large footprint (default)
 
 #define SHA2_ROUND_BODY(in, r)   do { \
-      sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
+      sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \
       sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
       sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
       \
@@ -212,388 +211,453 @@ static const sph_u32 K[64] = {
|
|||||||
F = (r)[5]; \
|
F = (r)[5]; \
|
||||||
G = (r)[6]; \
|
G = (r)[6]; \
|
||||||
H = (r)[7]; \
|
H = (r)[7]; \
|
||||||
|
Y_xor_Z = B ^ C; \
|
||||||
W00 = in(0); \
|
W00 = in(0); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0x428A2F98) + W00); \
|
+ SPH_C32(0x428A2F98) + W00); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W01 = in(1); \
|
W01 = in(1); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0x71374491) + W01); \
|
+ SPH_C32(0x71374491) + W01); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
C = SPH_T32(C + T1); \
|
C = SPH_T32(C + T1); \
|
||||||
G = SPH_T32(T1 + T2); \
|
G = SPH_T32(T1 + T2); \
|
||||||
W02 = in(2); \
|
W02 = in(2); \
|
||||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||||
+ SPH_C32(0xB5C0FBCF) + W02); \
|
+ SPH_C32(0xB5C0FBCF) + W02); \
|
||||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
B = SPH_T32(B + T1); \
|
B = SPH_T32(B + T1); \
|
||||||
F = SPH_T32(T1 + T2); \
|
F = SPH_T32(T1 + T2); \
|
||||||
W03 = in(3); \
|
W03 = in(3); \
|
||||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||||
+ SPH_C32(0xE9B5DBA5) + W03); \
|
+ SPH_C32(0xE9B5DBA5) + W03); \
|
||||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
A = SPH_T32(A + T1); \
|
A = SPH_T32(A + T1); \
|
||||||
E = SPH_T32(T1 + T2); \
|
E = SPH_T32(T1 + T2); \
|
||||||
W04 = in(4); \
|
W04 = in(4); \
|
||||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||||
+ SPH_C32(0x3956C25B) + W04); \
|
+ SPH_C32(0x3956C25B) + W04); \
|
||||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
H = SPH_T32(H + T1); \
|
H = SPH_T32(H + T1); \
|
||||||
D = SPH_T32(T1 + T2); \
|
D = SPH_T32(T1 + T2); \
|
||||||
W05 = in(5); \
|
W05 = in(5); \
|
||||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||||
+ SPH_C32(0x59F111F1) + W05); \
|
+ SPH_C32(0x59F111F1) + W05); \
|
||||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
G = SPH_T32(G + T1); \
|
G = SPH_T32(G + T1); \
|
||||||
C = SPH_T32(T1 + T2); \
|
C = SPH_T32(T1 + T2); \
|
||||||
W06 = in(6); \
|
W06 = in(6); \
|
||||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||||
+ SPH_C32(0x923F82A4) + W06); \
|
+ SPH_C32(0x923F82A4) + W06); \
|
||||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
F = SPH_T32(F + T1); \
|
F = SPH_T32(F + T1); \
|
||||||
B = SPH_T32(T1 + T2); \
|
B = SPH_T32(T1 + T2); \
|
||||||
W07 = in(7); \
|
W07 = in(7); \
|
||||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||||
+ SPH_C32(0xAB1C5ED5) + W07); \
|
+ SPH_C32(0xAB1C5ED5) + W07); \
|
||||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
E = SPH_T32(E + T1); \
|
E = SPH_T32(E + T1); \
|
||||||
A = SPH_T32(T1 + T2); \
|
A = SPH_T32(T1 + T2); \
|
||||||
W08 = in(8); \
|
W08 = in(8); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0xD807AA98) + W08); \
|
+ SPH_C32(0xD807AA98) + W08); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W09 = in(9); \
|
W09 = in(9); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0x12835B01) + W09); \
|
+ SPH_C32(0x12835B01) + W09); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
C = SPH_T32(C + T1); \
|
C = SPH_T32(C + T1); \
|
||||||
G = SPH_T32(T1 + T2); \
|
G = SPH_T32(T1 + T2); \
|
||||||
W10 = in(10); \
|
W10 = in(10); \
|
||||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||||
+ SPH_C32(0x243185BE) + W10); \
|
+ SPH_C32(0x243185BE) + W10); \
|
||||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
B = SPH_T32(B + T1); \
|
B = SPH_T32(B + T1); \
|
||||||
F = SPH_T32(T1 + T2); \
|
F = SPH_T32(T1 + T2); \
|
||||||
W11 = in(11); \
|
W11 = in(11); \
|
||||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||||
+ SPH_C32(0x550C7DC3) + W11); \
|
+ SPH_C32(0x550C7DC3) + W11); \
|
||||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
A = SPH_T32(A + T1); \
|
A = SPH_T32(A + T1); \
|
||||||
E = SPH_T32(T1 + T2); \
|
E = SPH_T32(T1 + T2); \
|
||||||
W12 = in(12); \
|
W12 = in(12); \
|
||||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||||
+ SPH_C32(0x72BE5D74) + W12); \
|
+ SPH_C32(0x72BE5D74) + W12); \
|
||||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
H = SPH_T32(H + T1); \
|
H = SPH_T32(H + T1); \
|
||||||
D = SPH_T32(T1 + T2); \
|
D = SPH_T32(T1 + T2); \
|
||||||
W13 = in(13); \
|
W13 = in(13); \
|
||||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||||
+ SPH_C32(0x80DEB1FE) + W13); \
|
+ SPH_C32(0x80DEB1FE) + W13); \
|
||||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
G = SPH_T32(G + T1); \
|
G = SPH_T32(G + T1); \
|
||||||
C = SPH_T32(T1 + T2); \
|
C = SPH_T32(T1 + T2); \
|
||||||
W14 = in(14); \
|
W14 = in(14); \
|
||||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||||
+ SPH_C32(0x9BDC06A7) + W14); \
|
+ SPH_C32(0x9BDC06A7) + W14); \
|
||||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
F = SPH_T32(F + T1); \
|
F = SPH_T32(F + T1); \
|
||||||
B = SPH_T32(T1 + T2); \
|
B = SPH_T32(T1 + T2); \
|
||||||
W15 = in(15); \
|
W15 = in(15); \
|
||||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||||
+ SPH_C32(0xC19BF174) + W15); \
|
+ SPH_C32(0xC19BF174) + W15); \
|
||||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
E = SPH_T32(E + T1); \
|
E = SPH_T32(E + T1); \
|
||||||
A = SPH_T32(T1 + T2); \
|
A = SPH_T32(T1 + T2); \
|
||||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0xE49B69C1) + W00); \
|
+ SPH_C32(0xE49B69C1) + W00); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0xEFBE4786) + W01); \
|
+ SPH_C32(0xEFBE4786) + W01); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
C = SPH_T32(C + T1); \
|
C = SPH_T32(C + T1); \
|
||||||
G = SPH_T32(T1 + T2); \
|
G = SPH_T32(T1 + T2); \
|
||||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||||
+ SPH_C32(0x0FC19DC6) + W02); \
|
+ SPH_C32(0x0FC19DC6) + W02); \
|
||||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
B = SPH_T32(B + T1); \
|
B = SPH_T32(B + T1); \
|
||||||
F = SPH_T32(T1 + T2); \
|
F = SPH_T32(T1 + T2); \
|
||||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||||
+ SPH_C32(0x240CA1CC) + W03); \
|
+ SPH_C32(0x240CA1CC) + W03); \
|
||||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
A = SPH_T32(A + T1); \
|
A = SPH_T32(A + T1); \
|
||||||
E = SPH_T32(T1 + T2); \
|
E = SPH_T32(T1 + T2); \
|
||||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||||
+ SPH_C32(0x2DE92C6F) + W04); \
|
+ SPH_C32(0x2DE92C6F) + W04); \
|
||||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
H = SPH_T32(H + T1); \
|
H = SPH_T32(H + T1); \
|
||||||
D = SPH_T32(T1 + T2); \
|
D = SPH_T32(T1 + T2); \
|
||||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||||
+ SPH_C32(0x4A7484AA) + W05); \
|
+ SPH_C32(0x4A7484AA) + W05); \
|
||||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
G = SPH_T32(G + T1); \
|
G = SPH_T32(G + T1); \
|
||||||
C = SPH_T32(T1 + T2); \
|
C = SPH_T32(T1 + T2); \
|
||||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||||
+ SPH_C32(0x5CB0A9DC) + W06); \
|
+ SPH_C32(0x5CB0A9DC) + W06); \
|
||||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
F = SPH_T32(F + T1); \
|
F = SPH_T32(F + T1); \
|
||||||
B = SPH_T32(T1 + T2); \
|
B = SPH_T32(T1 + T2); \
|
||||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||||
+ SPH_C32(0x76F988DA) + W07); \
|
+ SPH_C32(0x76F988DA) + W07); \
|
||||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
E = SPH_T32(E + T1); \
|
E = SPH_T32(E + T1); \
|
||||||
A = SPH_T32(T1 + T2); \
|
A = SPH_T32(T1 + T2); \
|
||||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0x983E5152) + W08); \
|
+ SPH_C32(0x983E5152) + W08); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0xA831C66D) + W09); \
|
+ SPH_C32(0xA831C66D) + W09); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
C = SPH_T32(C + T1); \
|
C = SPH_T32(C + T1); \
|
||||||
G = SPH_T32(T1 + T2); \
|
G = SPH_T32(T1 + T2); \
|
||||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||||
+ SPH_C32(0xB00327C8) + W10); \
|
+ SPH_C32(0xB00327C8) + W10); \
|
||||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
B = SPH_T32(B + T1); \
|
B = SPH_T32(B + T1); \
|
||||||
F = SPH_T32(T1 + T2); \
|
F = SPH_T32(T1 + T2); \
|
||||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||||
+ SPH_C32(0xBF597FC7) + W11); \
|
+ SPH_C32(0xBF597FC7) + W11); \
|
||||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
A = SPH_T32(A + T1); \
|
A = SPH_T32(A + T1); \
|
||||||
E = SPH_T32(T1 + T2); \
|
E = SPH_T32(T1 + T2); \
|
||||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||||
+ SPH_C32(0xC6E00BF3) + W12); \
|
+ SPH_C32(0xC6E00BF3) + W12); \
|
||||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
H = SPH_T32(H + T1); \
|
H = SPH_T32(H + T1); \
|
||||||
D = SPH_T32(T1 + T2); \
|
D = SPH_T32(T1 + T2); \
|
||||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||||
+ SPH_C32(0xD5A79147) + W13); \
|
+ SPH_C32(0xD5A79147) + W13); \
|
||||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
G = SPH_T32(G + T1); \
|
G = SPH_T32(G + T1); \
|
||||||
C = SPH_T32(T1 + T2); \
|
C = SPH_T32(T1 + T2); \
|
||||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||||
+ SPH_C32(0x06CA6351) + W14); \
|
+ SPH_C32(0x06CA6351) + W14); \
|
||||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
F = SPH_T32(F + T1); \
|
F = SPH_T32(F + T1); \
|
||||||
B = SPH_T32(T1 + T2); \
|
B = SPH_T32(T1 + T2); \
|
||||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||||
+ SPH_C32(0x14292967) + W15); \
|
+ SPH_C32(0x14292967) + W15); \
|
||||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
E = SPH_T32(E + T1); \
|
E = SPH_T32(E + T1); \
|
||||||
A = SPH_T32(T1 + T2); \
|
A = SPH_T32(T1 + T2); \
|
||||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0x27B70A85) + W00); \
|
+ SPH_C32(0x27B70A85) + W00); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0x2E1B2138) + W01); \
|
+ SPH_C32(0x2E1B2138) + W01); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
C = SPH_T32(C + T1); \
|
C = SPH_T32(C + T1); \
|
||||||
G = SPH_T32(T1 + T2); \
|
G = SPH_T32(T1 + T2); \
|
||||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||||
+ SPH_C32(0x4D2C6DFC) + W02); \
|
+ SPH_C32(0x4D2C6DFC) + W02); \
|
||||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
B = SPH_T32(B + T1); \
|
B = SPH_T32(B + T1); \
|
||||||
F = SPH_T32(T1 + T2); \
|
F = SPH_T32(T1 + T2); \
|
||||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||||
+ SPH_C32(0x53380D13) + W03); \
|
+ SPH_C32(0x53380D13) + W03); \
|
||||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
A = SPH_T32(A + T1); \
|
A = SPH_T32(A + T1); \
|
||||||
E = SPH_T32(T1 + T2); \
|
E = SPH_T32(T1 + T2); \
|
||||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||||
+ SPH_C32(0x650A7354) + W04); \
|
+ SPH_C32(0x650A7354) + W04); \
|
||||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
H = SPH_T32(H + T1); \
|
H = SPH_T32(H + T1); \
|
||||||
D = SPH_T32(T1 + T2); \
|
D = SPH_T32(T1 + T2); \
|
||||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||||
+ SPH_C32(0x766A0ABB) + W05); \
|
+ SPH_C32(0x766A0ABB) + W05); \
|
||||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
G = SPH_T32(G + T1); \
|
G = SPH_T32(G + T1); \
|
||||||
C = SPH_T32(T1 + T2); \
|
C = SPH_T32(T1 + T2); \
|
||||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||||
+ SPH_C32(0x81C2C92E) + W06); \
|
+ SPH_C32(0x81C2C92E) + W06); \
|
||||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
F = SPH_T32(F + T1); \
|
F = SPH_T32(F + T1); \
|
||||||
B = SPH_T32(T1 + T2); \
|
B = SPH_T32(T1 + T2); \
|
||||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||||
+ SPH_C32(0x92722C85) + W07); \
|
+ SPH_C32(0x92722C85) + W07); \
|
||||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
E = SPH_T32(E + T1); \
|
E = SPH_T32(E + T1); \
|
||||||
A = SPH_T32(T1 + T2); \
|
A = SPH_T32(T1 + T2); \
|
||||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||||
+ SPH_C32(0xA2BFE8A1) + W08); \
|
+ SPH_C32(0xA2BFE8A1) + W08); \
|
||||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = SPH_T32(D + T1); \
|
D = SPH_T32(D + T1); \
|
||||||
H = SPH_T32(T1 + T2); \
|
H = SPH_T32(T1 + T2); \
|
||||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||||
+ SPH_C32(0xA81A664B) + W09); \
|
+ SPH_C32(0xA81A664B) + W09); \
|
||||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||||
|
Y_xor_Z = X_xor_Y; \
|
||||||
 C = SPH_T32(C + T1); \
 G = SPH_T32(T1 + T2); \
 W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
 T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
     + SPH_C32(0xC24B8B70) + W10); \
 T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
 B = SPH_T32(B + T1); \
 F = SPH_T32(T1 + T2); \
 W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
 T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
     + SPH_C32(0xC76C51A3) + W11); \
 T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
 A = SPH_T32(A + T1); \
 E = SPH_T32(T1 + T2); \
 [The same pattern continues through the rest of the round body: the message
  schedule, T1 and T2 lines for the rounds with constants 0xD192E819,
  0xD6990624, 0xF40E3585, 0x106AA070, 0x19A4C116, 0x1E376C08, 0x2748774C,
  0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, 0x748F82EE,
  0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB and 0xBEF9A3F7
  are unchanged, and a "Y_xor_Z = X_xor_Y; \" line is added after each T2
  assignment.]
 W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
 T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
     + SPH_C32(0xC67178F2) + W15); \
 T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
 E = SPH_T32(E + T1); \
 A = SPH_T32(T1 + T2); \
 (r)[0] = SPH_T32((r)[0] + A); \
@@ -619,8 +683,54 @@ sha2_round(const unsigned char *data, sph_u32 r[8])
 #undef SHA2_IN
 }
 
-#endif // SHA else
+void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
+                              const uint32_t *state_in )
+{
+   memcpy( state_out, state_in, 32 );
+#define SHA2_IN(x)   (data[x])
+   SHA2_ROUND_BODY( SHA2_IN, state_out );
+#undef SHA2_IN
+}
+
+void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
+                              const uint32_t *state_in )
+{
+   memcpy( state_out, state_in, 32 );
+#define SHA2_IN(x)   sph_dec32be_aligned( data+(x) )
+   SHA2_ROUND_BODY( SHA2_IN, state_out );
+#undef SHA2_IN
+
+}
+
+void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
+                                 const uint32_t *state_in )
+{
+   uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
+   memcpy( state_out, state_in, 32 );
+
+   t1 = state_out[7] + BSG2_1( state_out[4] )
+      + CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
+   t2 = BSG2_0( state_out[0] )
+      + MAJ( state_out[0], state_out[1], state_out[2] );
+   Y_xor_Z = X_xor_Y;
+   state_out[3] += t1;
+   state_out[7] = t1 + t2;
+
+   t1 = state_out[6] + BSG2_1( state_out[3] )
+      + CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
+   t2 = BSG2_0( state_out[7] )
+      + MAJ( state_out[7], state_out[0], state_out[1] );
+   Y_xor_Z = X_xor_Y;
+   state_out[2] += t1;
+   state_out[6] = t1 + t2;
+
+   t1 = state_out[5] + BSG2_1( state_out[2] )
+      + CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
+   t2 = BSG2_0( state_out[6] )
+      + MAJ( state_out[6], state_out[7], state_out[0] );
+   state_out[1] += t1;
+   state_out[5] = t1 + t2;
+}
+
 /* see sph_sha2.h */
 void
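For reference while reading sph_sha256_prehash_3rounds() above: each of its three blocks is one standard FIPS 180-4 SHA-256 round. Below is a minimal plain-C sketch of a single round; it is not the macro form used in this file, and the helper function names are made up for the illustration.

   /* One SHA-256 round in plain C, for illustration only; SHA2_ROUND_BODY
      unrolls 64 of these. */
   #include <stdint.h>

   static inline uint32_t rotr32( uint32_t x, int n )
   {  return ( x >> n ) | ( x << ( 32 - n ) );  }

   static void sha256_round( uint32_t s[8], uint32_t k, uint32_t w )
   {
      uint32_t t1 = s[7]
                  + ( rotr32( s[4], 6 ) ^ rotr32( s[4], 11 )
                    ^ rotr32( s[4], 25 ) )                      // BSG2_1(e)
                  + ( ( s[4] & s[5] ) ^ ( ~s[4] & s[6] ) )      // CH(e,f,g)
                  + k + w;
      uint32_t t2 = ( rotr32( s[0], 2 ) ^ rotr32( s[0], 13 )
                    ^ rotr32( s[0], 22 ) )                      // BSG2_0(a)
                  + ( ( s[0] & s[1] ) | ( ( s[0] | s[1] ) & s[2] ) ); // MAJ
      // rotate the working variables: h=g, g=f, ..., then e = d + t1
      for ( int i = 7; i > 0; i-- ) s[i] = s[i-1];
      s[4] += t1;            // after the shift s[4] holds the old d
      s[0] = t1 + t2;        // new a
   }

The prehash helper runs exactly this with the round constants 0x428A2F98, 0x71374491 and 0xB5C0FBCF and the first three message words, so the result can be cached whenever data[0..2] do not change between hashes.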
@@ -689,6 +799,14 @@ sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 //	sph_sha256_init(cc);
 }
 
+void sph_sha256_full( void *dst, const void *data, size_t len )
+{
+   sph_sha256_context cc;
+   sph_sha256_init( &cc );
+   sph_sha256( &cc, data, len );
+   sph_sha256_close( &cc, dst );
+}
+
 /* see sph_sha2.h */
 //void
 //sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
@@ -205,6 +205,20 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
 #define sph_sha256_comp sph_sha224_comp
 #endif
 
+void sph_sha256_full( void *dst, const void *data, size_t len );
+
+// These shouldn't be called directly, use sha256-hash.h generic functions
+// sha256_transform_le & sha256_transform_be instead.
+void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
+                              const uint32_t *state_in );
+
+void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
+                              const uint32_t *state_in );
+
+void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
+                                 const uint32_t *state_in );
+
+
 #if SPH_64
 
 /**
@@ -38,7 +38,8 @@
 #if SPH_64
 
 #define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
+//#define MAJ(X, Y, Z)  (((X) & (Y)) | (((X) | (Y)) & (Z)))
+#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )
 
 #define ROTR64    SPH_ROTR64
 
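The rewritten MAJ relies on the identity MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)), which costs one AND and two XORs, and it lets consecutive rounds reuse one round's X^Y as the next round's Y^Z; the `Y_xor_Z = X_xor_Y;` lines added to the SHA-256 round body earlier in this change appear to be caching exactly that term. A quick stand-alone check of the identity (not part of the miner):

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   #define MAJ_OLD(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
   #define MAJ_NEW(X, Y, Z) ((Y) ^ (((X) ^ (Y)) & ((Y) ^ (Z))))

   int main( void )
   {
      // Bitwise operators act independently per bit, so checking the 8
      // one-bit combinations covers every 32/64-bit input as well.
      for ( uint32_t x = 0; x < 2; x++ )
         for ( uint32_t y = 0; y < 2; y++ )
            for ( uint32_t z = 0; z < 2; z++ )
               assert( MAJ_OLD( x, y, z ) == MAJ_NEW( x, y, z ) );
      puts( "MAJ forms agree" );
      return 0;
   }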
@@ -70,6 +70,8 @@ extern "C"{
             C8, C9, CA, CB, CC, CD, CE, CF; \
    __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
            M8, M9, MA, MB, MC, MD, ME, MF; \
+   const __m256i FIVE  = _mm256_set1_epi32( 5 ); \
+   const __m256i THREE = _mm256_set1_epi32( 3 ); \
    sph_u32 Wlow, Whigh;
 
 #define READ_STATE8(state) do \
@@ -310,12 +312,12 @@ do { \
 
 #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
-   xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
+   xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
                    _mm256_andnot_si256( xb3, xb2 ), \
-      _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
-      _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
-      ) ), _mm256_set1_epi32(3UL) ) ) ) ); \
-   xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
+      _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
+      _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
+                          FIVE ) ), THREE ) ) ); \
+   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)
 
 #define PERM_STEP_0_8 do { \
@@ -666,6 +668,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
             C8, C9, CA, CB, CC, CD, CE, CF; \
    __m128i M0, M1, M2, M3, M4, M5, M6, M7, \
            M8, M9, MA, MB, MC, MD, ME, MF; \
+   const __m128i FIVE  = _mm_set1_epi32( 5 ); \
+   const __m128i THREE = _mm_set1_epi32( 3 ); \
    sph_u32 Wlow, Whigh;
 
 #define READ_STATE(state) do \
@@ -930,8 +934,8 @@ do { \
    xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
                    _mm_andnot_si128( xb3, xb2 ), \
       _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
-      _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
-      ) ), _mm_set1_epi32(3UL) ) ) ) ); \
+      _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
+      ) ), THREE ) ) ) ); \
    xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
 } while (0)
 
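mm256_xor3() and mm256_xnor() used in the new PERM_ELT8 are helpers from this repo's simd-utils headers, not standard intrinsics, and their definitions are not part of this diff. A plausible sketch is below; the vpternlogd immediates and the AVX2 fallback are assumptions for illustration. The point is that on CPUs with AVX512VL a three-input XOR or an XNOR collapses to a single vpternlogd, with a two-instruction fallback elsewhere.

   #include <immintrin.h>

   #if defined(__AVX512VL__)
      // truth-table immediates: 0x96 = a ^ b ^ c, 0xc3 = ~(a ^ b), c ignored
      #define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi32( a, b, c, 0x96 )
      #define mm256_xnor( a, b )    _mm256_ternarylogic_epi32( a, b, b, 0xc3 )
   #else
      #define mm256_xor3( a, b, c ) \
         _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
      #define mm256_xnor( a, b ) \
         _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_set1_epi32( -1 ) )
   #endif

Hoisting FIVE and THREE out of the permutation likewise just avoids re-materialising the multiplier constants in every PERM_ELT8 expansion.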
@@ -20,17 +20,26 @@ static const uint32_t IV512[] =
 
 
 #define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_ror128_32( a ), \
-                       mm256_ror128_32( b ), 0x88 )
+   _mm256_blend_epi32( mm256_shuflr128_32( a ), \
+                       mm256_shuflr128_32( b ), 0x88 )
+
+#if defined(__VAES__)
+
+#define mm256_aesenc_2x128( x, k ) \
+   _mm256_aesenc_epi128( x, _mm256_castsi128_si256( k ) )
+
+#else
+
+#define mm256_aesenc_2x128( x, k ) \
+   mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \
+                     _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) )
+
+#endif
 
 static void
 c512_2way( shavite512_2way_context *ctx, const void *msg )
 {
-#if defined(__VAES__)
-   const __m256i zero = _mm256_setzero_si256();
-#else
    const __m128i zero = _mm_setzero_si128();
-#endif
    __m256i p0, p1, p2, p3, x;
    __m256i k00, k01, k02, k03, k10, k11, k12, k13;
    __m256i *m = (__m256i*)msg;
@@ -69,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 {
    // round 1, 5, 9
 
-   k00 = _mm256_xor_si256( k13, mm256_ror128_32(
+   k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
                            mm256_aesenc_2x128( k00, zero ) ) );
 
    if ( r == 0 )
@@ -79,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 
    x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
    k01 = _mm256_xor_si256( k00,
-                  mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
+                  mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
 
    if ( r == 1 )
       k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -88,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 
    x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
    k02 = _mm256_xor_si256( k01,
-                  mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
+                  mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) );
    x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
 [The k03, k10, k11, k12 and k13 key-schedule lines in this hunk change the
 same way: each mm256_ror128_32( mm256_aesenc_2x128( ... ) ) becomes
 mm256_shuflr128_32( mm256_aesenc_2x128( ... ) ); the surrounding xor/aesenc
 lines, the p3 update and the r == 2 count injection are unchanged.]
@@ -142,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 
    // round 3, 7, 11
 
-   k00 = _mm256_xor_si256( mm256_ror128_32(
+   k00 = _mm256_xor_si256( mm256_shuflr128_32(
                            mm256_aesenc_2x128( k00, zero ) ), k13 );
    x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
 [k01 through k13 get the identical mm256_ror128_32 -> mm256_shuflr128_32
 rename in this hunk; the aesenc/xor data path and the p1 update are
 unchanged.]
@@ -200,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 
    // round 13
 
-   k00 = _mm256_xor_si256( mm256_ror128_32(
+   k00 = _mm256_xor_si256( mm256_shuflr128_32(
                            mm256_aesenc_2x128( k00, zero ) ), k13 );
    x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
 [k01, k02, k03, k10, k11 and k13 are renamed the same way, as is]
-   k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
+   k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
    k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
                  ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
                  ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
 
    x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
@@ -308,7 +317,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
    uint32_t vp = ctx->ptr>>5;
 
    // Terminating byte then zero pad
-   casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
+   casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
 
    // Zero pad full vectors up to count
    for ( ; vp < 6; vp++ )
@@ -388,13 +397,13 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
 
    if ( vp == 0 )    // empty buf, xevan.
    {
-      casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
+      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
-      casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
+      casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }
 
@@ -478,13 +487,13 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
 
   if ( vp == 0 )    // empty buf, xevan.
   {
-      casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
+      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
-      casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
+      casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }
 
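The new mm256_aesenc_2x128() macro above applies one AES encryption round, with the same 128-bit round key, to both 128-bit lanes of a 256-bit vector: a single vaesenc when VAES is available, otherwise split, encrypt and re-join. Written with raw AVX2/AES-NI intrinsics instead of the repo's concat/extract helpers, the fallback amounts to something like the sketch below (the function name is made up for illustration).

   #include <immintrin.h>

   // Needs -maes -mavx2; illustrative equivalent of the non-VAES path.
   static inline __m256i aesenc_2x128_fallback( __m256i x, __m128i k )
   {
      __m128i lo = _mm256_castsi256_si128( x );        // low 128-bit lane
      __m128i hi = _mm256_extracti128_si256( x, 1 );   // high 128-bit lane
      lo = _mm_aesenc_si128( lo, k );
      hi = _mm_aesenc_si128( hi, k );
      return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
   }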
@@ -12,8 +12,8 @@ static const uint32_t IV512[] =
 };
 
 #define mm512_ror2x512hi_1x32( a, b ) \
-   _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
-                                    mm512_ror128_32( b ) )
+   _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
+                                    mm512_shuflr128_32( b ) )
 
 static void
 c512_4way( shavite512_4way_context *ctx, const void *msg )
@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
    register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
    __m512i *M = (__m512i*)msg;
    __m512i *H = (__m512i*)ctx->h;
+   const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
+                                            ctx->count1, ctx->count0 );
    int r;
 
    P0 = H[0];
@@ -58,46 +60,46 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
 {
    // round 1, 5, 9
 
-   K0 = _mm512_xor_si512( K7, mm512_ror128_32(
+   K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
                           _mm512_aesenc_epi128( K0, m512_zero ) ) );
 
    if ( r == 0 )
-      K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
-                 ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
+      K0 = _mm512_xor_si512( K0,
+                 _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
 
    X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
    K1 = _mm512_xor_si512( K0,
-              mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
+              mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
 
    if ( r == 1 )
-      K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
-                 ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
+      K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
-                 _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
 
    X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
 [K2 through K7 in this hunk change the same way as K1: each
 mm512_ror128_32( _mm512_aesenc_epi128( ... ) ) becomes
 mm512_shuflr128_32( _mm512_aesenc_epi128( ... ) ); the P3 update and the
 intervening aesenc/xor lines are unchanged.]
    if ( r == 2 )
-      K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
-                 ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
+      K7 = _mm512_xor_si512( K7, mm512_swap128_64(
+                 _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
 
    X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
    P1 = _mm512_xor_si512( P1, X );
@@ -128,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
 
    // round 3, 7, 11
 
-   K0 = _mm512_xor_si512( mm512_ror128_32(
+   K0 = _mm512_xor_si512( mm512_shuflr128_32(
                           _mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
    X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
 [K1 through K7 get the identical rename in this hunk; the P1 update and the
 data-path lines are unchanged.]
@@ -185,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
 
    // round 13
 
-   K0 = _mm512_xor_si512( mm512_ror128_32(
+   K0 = _mm512_xor_si512( mm512_shuflr128_32(
                           _mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
    X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
 [K1 through K5 and K7 are renamed the same way, as is]
-   K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
+   K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
    K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
                  ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
 
    X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
-   K7 = _mm512_xor_si512( mm512_ror128_32(
+   K7 = _mm512_xor_si512( mm512_shuflr128_32(
                           _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
    X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
 
@@ -292,7 +294,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
    uint32_t vp = ctx->ptr>>6;
 
    // Terminating byte then zero pad
-   casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
+   casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
 
    // Zero pad full vectors up to count
    for ( ; vp < 6; vp++ )
@@ -372,13 +374,13 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
 
    if ( vp == 0 )    // empty buf, xevan.
    {
-      casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );
+      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
-      casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
+      casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }
 
@@ -463,13 +465,13 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
 
   if ( vp == 0 )    // empty buf, xevan.
   {
-      casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );
+      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
-      casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
+      casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }
 
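The count handling in c512_4way() above builds the counter vector once and then derives tweaks such as { ~count3, count2, count1, count0 } with _mm512_mask_xor_epi32, which XORs all-ones into only the 32-bit elements selected by the mask (0x8888 selects element 3 of every 128-bit lane), i.e. a selective bitwise NOT. A small stand-alone demonstration of the trick (variable names are mine, not the miner's):

   #include <immintrin.h>
   #include <stdint.h>
   #include <stdio.h>

   int main( void )
   {
      const __m512i ones  = _mm512_set1_epi32( -1 );
      // 16 x 32-bit counters, the same 4 values repeated per 128-bit lane
      const __m512i count = _mm512_set4_epi32( 3, 2, 1, 0 );
      // 0x8888 selects element 3 of every 128-bit lane, so each lane becomes
      // { ~3, 2, 1, 0 } -- the layout the old
      // _mm512_set4_epi32( ~count3, count2, count1, count0 ) produced.
      __m512i t = _mm512_mask_xor_epi32( count, 0x8888, count, ones );

      uint32_t out[16];
      _mm512_storeu_si512( out, t );
      printf( "%08x %08x %08x %08x\n", out[3], out[2], out[1], out[0] );
      return 0;
   }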
@@ -74,15 +74,15 @@ static const sph_u32 IV512[] = {
 
 #endif
 
+/*
 #if defined(__AVX2__)
 // 2 way version of above
 // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
 
 #define mm256_ror2x256hi_1x32( a, b ) \
    _mm256_blend_epi32( mm256_ror256_1x32( a ), \
                        mm256_rol256_3x32( b ), 0x88 )
 
 #endif
+*/
 
 static void
 c512( sph_shavite_big_context *sc, const void *msg )
@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
    // round
 
-   // working proof of concept
-/*
-   __m512i K = m512_const1_128( m[0] );
-   __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
-   X = _mm512_aesenc_epi128( X, m512_zero );
-   k00 = _mm512_castsi512_si128( K );
-   x = _mm512_castsi512_si128( X );
-*/
-
    k00 = m[0];
    x = _mm_xor_si128( p1, k00 );
    x = _mm_aesenc_si128( x, zero );
@@ -144,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
    for ( r = 0; r < 3; r ++ )
    {
       // round 1, 5, 9
-      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
+      k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
      k00 = _mm_xor_si128( k00, k13 );
 
      if ( r == 0 )
@@ -153,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, zero );
-      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
+      k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
      k01 = _mm_xor_si128( k01, k00 );
 
      if ( r == 1 )
@@ -162,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
     x = _mm_xor_si128( x, k01 );
     x = _mm_aesenc_si128( x, zero );
-      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
+      k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
     k02 = _mm_xor_si128( k02, k01 );
 [k03, k10, k11, k12 and k13 in this hunk are renamed from mm128_ror_1x32 to
 mm128_shuflr_32 in exactly the same way; the xor/aesenc data path and the
 p3 update are unchanged.]
@@ -231,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
     // round 3, 7, 11
 
-      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
+      k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
     k00 = _mm_xor_si128( k00, k13 );
     x = _mm_xor_si128( p2, k00 );
     x = _mm_aesenc_si128( x, zero );
 [k01 through k13 get the identical rename in this hunk; the p1 update and
 the data-path lines are unchanged.]
@@ -304,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
     // round 13
 
-      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
+      k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
     k00 = _mm_xor_si128( k00, k13 );
     x = _mm_xor_si128( p0, k00 );
     x = _mm_aesenc_si128( x, zero );
 [k01, k02, k03, k10 and k11 are renamed the same way, as are]
-      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
+      k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
     k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
                ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
     x = _mm_xor_si128( x, k12 );
     x = _mm_aesenc_si128( x, zero );
-      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
+      k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
     k13 = _mm_xor_si128( k13, k12 );
     x = _mm_xor_si128( x, k13 );
     x = _mm_aesenc_si128( x, zero );
@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
 
   static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
 
-
-//  static const m512_v16 code[] = { c1_16(185), c1_16(233),
-//                                   c1_16(185), c1_16(233) };
-
-
   S0l = _mm512_xor_si512( S[0], M[0] );
   S0h = _mm512_xor_si512( S[1], M[1] );
   S1l = _mm512_xor_si512( S[2], M[2] );
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
   // targetted, local macros don't need a unique name
 #define S(i) S##i
 
+#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
+#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
+
+/*
 #define F_0(B, C, D) \
    _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
 #define F_1(B, C, D) \
    _mm512_or_si512( _mm512_and_si512( D, C ),\
                     _mm512_and_si512( _mm512_or_si512( D,C ), B ) )
+*/
 
 #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
 #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
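The new F_0/F_1 above each collapse a multi-instruction boolean expression into one vpternlogd. The 8-bit immediate is simply the truth table of the desired function over (B, C, D), indexed by (B<<2)|(C<<1)|D: 0xca is the old F_0 selector (B ? C : D) and 0xe8 is the old F_1 majority. A small stand-alone program, not part of the miner, that derives those immediates from the original formulas:

   #include <stdint.h>
   #include <stdio.h>

   // Build the vpternlogd immediate from any 3-input boolean function.
   static uint8_t ternlog_imm( uint32_t (*f)( uint32_t, uint32_t, uint32_t ) )
   {
      uint8_t imm = 0;
      for ( int i = 0; i < 8; i++ )
         if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) & 1 )
            imm |= (uint8_t)( 1u << i );
      return imm;
   }

   static uint32_t f0( uint32_t b, uint32_t c, uint32_t d )
   {  return ( ( c ^ d ) & b ) ^ d;  }              // B ? C : D  (old F_0)
   static uint32_t f1( uint32_t b, uint32_t c, uint32_t d )
   {  return ( d & c ) | ( ( d | c ) & b );  }      // majority   (old F_1)

   int main( void )
   {
      printf( "F_0 imm = 0x%02x, F_1 imm = 0x%02x\n",
              ternlog_imm( f0 ), ternlog_imm( f1 ) );   // expect 0xca, 0xe8
      return 0;
   }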
@@ -6,10 +6,6 @@
 
 #define PRINT_SOME 0
 
-/* JDD all ocurrances of macro X in this file renamed to XX
- * due to name conflict
- */
-
 int SupportedLength(int hashbitlen) {
   if (hashbitlen <= 0 || hashbitlen > 512)
     return 0;
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "skein-hash-4way.h"
 #include "algo/sha/sha-hash-4way.h"
-#include "algo/sha/sph_sha2.h"
+#include "algo/sha/sha256-hash.h"

 #if defined (SKEIN_8WAY)

@@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input )
    uint32_t hash1[16] __attribute__ ((aligned (64)));
    uint32_t hash2[16] __attribute__ ((aligned (64)));
    uint32_t hash3[16] __attribute__ ((aligned (64)));
-   sph_sha256_context ctx_sha256;
 #else
    uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
    sha256_4way_context ctx_sha256;
@@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input )
 #if defined(__SHA__)

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hash0, 64 );
-   sph_sha256_close( &ctx_sha256, hash0 );
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hash1, 64 );
-   sph_sha256_close( &ctx_sha256, hash1 );
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hash2, 64 );
-   sph_sha256_close( &ctx_sha256, hash2 );
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256( &ctx_sha256, hash3, 64 );
-   sph_sha256_close( &ctx_sha256, hash3 );
+   sha256_full( hash0, hash0, 64 );
+   sha256_full( hash1, hash1, 64 );
+   sha256_full( hash2, hash2, 64 );
+   sha256_full( hash3, hash3, 64 );

    intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );

 #else
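The hunks above collapse each sph_sha256 init/update/close triple into a single sha256_full() call. sha256_full() itself is not shown in this diff; presumably it is a one-shot convenience wrapper equivalent to the following sketch (only the sph API names are taken from the sources, the wrapper name here is illustrative):

#include <stddef.h>
#include "algo/sha/sph_sha2.h"

static void sha256_full_sketch( void *hash, const void *data, size_t len )
{
   sph_sha256_context ctx;
   sph_sha256_init( &ctx );           // reset state
   sph_sha256( &ctx, data, len );     // absorb the message
   sph_sha256_close( &ctx, hash );    // finalize into hash
}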
@@ -309,19 +309,13 @@ static const uint64_t IV512[] = {
       sc->bcount = bcount; \
    } while (0)

-// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
-

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
 do { \
-  k8 = _mm512_xor_si512( _mm512_xor_si512( \
-                  _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
-                                    _mm512_xor_si512( k2, k3 ) ), \
-                  _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
-                                    _mm512_xor_si512( k6, k7 ) ) ), \
-                  m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
+  k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
+                   mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
   t2 = t0 ^ t1; \
 } while (0)

@@ -340,7 +334,6 @@ do { \
               m512_const1_64( s ) ) ); \
 } while (0)

-
 #define TFBIG_MIX_8WAY(x0, x1, rc) \
 do { \
    x0 = _mm512_add_epi64( x0, x1 ); \
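mm512_xor3(), used by the new TFBIG_KINIT_8WAY, is not defined in this hunk; presumably it is a three-input XOR built on the AVX-512 ternary-logic instruction. A hedged sketch of such a helper (the name mm512_xor3_sketch is illustrative, not from the sources):

#if defined(__AVX512F__)
#include <immintrin.h>

static inline __m512i mm512_xor3_sketch( __m512i a, __m512i b, __m512i c )
{
   // 0x96 is the truth table of a ^ b ^ c
   return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}
#endif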
@@ -5,21 +5,18 @@
 #include <string.h>
 #include <stdint.h>
 #include "sph_skein.h"
-#include "algo/sha/sph_sha2.h"
+#include "algo/sha/sha256-hash.h"

 void skeinhash(void *state, const void *input)
 {
     uint32_t hash[16] __attribute__ ((aligned (64)));
     sph_skein512_context ctx_skein;
-    sph_sha256_context ctx_sha256;

     sph_skein512_init( &ctx_skein );
     sph_skein512( &ctx_skein, input, 80 );
     sph_skein512_close( &ctx_skein, hash );

-    sph_sha256_init( &ctx_sha256 );
-    sph_sha256( &ctx_sha256, hash, 64 );
-    sph_sha256_close( &ctx_sha256, hash );
+    sha256_full( hash, hash, 64 );

     memcpy(state, hash, 32);
 }
@@ -18,16 +18,20 @@
 #ifndef __INTTYPES_H_
 #define __INTTYPES_H_

+#include <stdint.h>
+
 /* Use [u]intN_t if you need exactly N bits.
    XXX - doesn't handle the -mint8 option. */

 typedef signed char swift_int8_t;
 typedef unsigned char swift_uint8_t;

-typedef int swift_int16_t;
+typedef int32_t swift_int16_t;
+// typedef int swift_int16_t;
 typedef unsigned int swift_uint16_t;

-typedef long swift_int32_t;
+typedef int32_t swift_int32_t;
+// typedef long swift_int32_t;
 typedef unsigned long swift_uint32_t;

 typedef long long swift_int64_t;
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2000 Jeroen Ruigrok van der Werven <asmodai@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: src/include/stdbool.h,v 1.6 2002/08/16 07:33:14 alfred Exp $
- */
-
-#ifndef _STDBOOL_H_
-#define _STDBOOL_H_
-
-#define __bool_true_false_are_defined 1
-
-#ifndef __cplusplus
-
-#define false 0
-#define true 1
-
-//#define bool _Bool
-//#if __STDC_VERSION__ < 199901L && __GNUC__ < 3
-//typedef int _Bool;
-//#endif
-typedef int bool;
-
-#endif /* !__cplusplus */
-
-#endif /* !_STDBOOL_H_ */
@@ -18,6 +18,8 @@
 //#include "stdbool.h"
 #include <memory.h>

+#include "simd-utils.h"
+
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // Constants and static tables portion.
 ///////////////////////////////////////////////////////////////////////////////////////////////
@@ -49,20 +51,20 @@
 // - A: the first operand. After the operation stores the sum of the two operands.
 // - B: the second operand. After the operation stores the difference between the first and the
 //      second operands.
-#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
+//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}

 // Quickly reduces an integer modulo 257.
 //
 // Parameters:
 // - A: the input.
-#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
+//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))

 // Since we need to do the setup only once, this is the indicator variable:
 static bool wasSetupDone = false;

 // This array stores the powers of omegas that correspond to the indices, which are the input
 // values. Known also as the "outer FFT twiddle factors".
-swift_int16_t multipliers[N];
+swift_int16_t multipliers[N] __attribute__ ((aligned (64)));

 // This array stores the powers of omegas, multiplied by the corresponding values.
 // We store this table to save computation time.
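The two scalar macros commented out above are re-introduced later as local, vectorized versions inside FFT(). The reason Q_REDUCE works: 256 is congruent to -1 mod 257, so a = 256*hi + lo is congruent to lo - hi, which is exactly (a & 0xff) - (a >> 8). A small stand-alone check, for illustration only:

#include <stdio.h>

#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))

int main()
{
   for ( int a = 0; a < (1 << 20); a++ )
   {
      int r = Q_REDUCE( a );                       // may be negative
      if ( ( ( r % 257 ) + 257 ) % 257 != a % 257 )
         printf( "mismatch at %d\n", a );          // never triggers
   }
   return 0;
}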
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
 // compression function, i is between 0 and 31, x_i is a 64-bit value.
 // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
 // formula (2), section 3, page 6.
-swift_int16_t fftTable[256 * EIGHTH_N];
+swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));

 // The A's we use in SWIFFTX shall be random elements of Z_257.
 // We generated these A's from the decimal expansion of PI as follows: we converted each
 // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
 // element, otherwise move to the next triple of digits in the expansion. This guarntees that
 // the A's are random, provided that PI digits are.
-const swift_int16_t As[3 * M * N] =
+const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
 {141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
  50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
  95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
@@ -636,10 +638,203 @@ void InitializeSWIFFTX()
|
|||||||
wasSetupDone = true;
|
wasSetupDone = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In the original code the F matrix is rotated so it was not aranged
|
||||||
|
// the same as all the other data. Rearanging F to match all the other
|
||||||
|
// data made vectorizing possible, the compiler probably could have been
|
||||||
|
// able to auto-vectorize with proper data organisation.
|
||||||
|
// Also in the original code the custom 16 bit data types are all now 32
|
||||||
|
// bit int32_t regardless of the type name.
|
||||||
|
//
|
||||||
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||||
{
|
{
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
__m256i F[8] __attribute__ ((aligned (64)));
|
||||||
|
__m256i *mul = (__m256i*)multipliers;
|
||||||
|
__m256i *out = (__m256i*)output;
|
||||||
|
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
|
||||||
|
|
||||||
|
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
|
||||||
|
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
|
||||||
|
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
|
||||||
|
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
|
||||||
|
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
|
||||||
|
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
|
||||||
|
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
|
||||||
|
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
|
||||||
|
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
|
||||||
|
|
||||||
|
#define ADD_SUB( a, b ) \
|
||||||
|
{ \
|
||||||
|
__m256i tmp = b; \
|
||||||
|
b = _mm256_sub_epi32( a, b ); \
|
||||||
|
a = _mm256_add_epi32( a, tmp ); \
|
||||||
|
}
|
||||||
|
|
||||||
|
ADD_SUB( F[0], F[1] );
|
||||||
|
ADD_SUB( F[2], F[3] );
|
||||||
|
ADD_SUB( F[4], F[5] );
|
||||||
|
ADD_SUB( F[6], F[7] );
|
||||||
|
|
||||||
|
F[3] = _mm256_slli_epi32( F[3], 4 );
|
||||||
|
F[7] = _mm256_slli_epi32( F[7], 4 );
|
||||||
|
|
||||||
|
ADD_SUB( F[0], F[2] );
|
||||||
|
ADD_SUB( F[1], F[3] );
|
||||||
|
ADD_SUB( F[4], F[6] );
|
||||||
|
ADD_SUB( F[5], F[7] );
|
||||||
|
|
||||||
|
F[5] = _mm256_slli_epi32( F[5], 2 );
|
||||||
|
F[6] = _mm256_slli_epi32( F[6], 4 );
|
||||||
|
F[7] = _mm256_slli_epi32( F[7], 6 );
|
||||||
|
|
||||||
|
ADD_SUB( F[0], F[4] );
|
||||||
|
ADD_SUB( F[1], F[5] );
|
||||||
|
ADD_SUB( F[2], F[6] );
|
||||||
|
ADD_SUB( F[3], F[7] );
|
||||||
|
|
||||||
|
#undef ADD_SUB
|
||||||
|
|
||||||
|
#if defined (__AVX512VL__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
#define Q_REDUCE( a ) \
|
||||||
|
_mm256_sub_epi32( _mm256_and_si256( a, \
|
||||||
|
_mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define Q_REDUCE( a ) \
|
||||||
|
_mm256_sub_epi32( _mm256_and_si256( a, \
|
||||||
|
m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) )
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
out[0] = Q_REDUCE( F[0] );
|
||||||
|
out[1] = Q_REDUCE( F[1] );
|
||||||
|
out[2] = Q_REDUCE( F[2] );
|
||||||
|
out[3] = Q_REDUCE( F[3] );
|
||||||
|
out[4] = Q_REDUCE( F[4] );
|
||||||
|
out[5] = Q_REDUCE( F[5] );
|
||||||
|
out[6] = Q_REDUCE( F[6] );
|
||||||
|
out[7] = Q_REDUCE( F[7] );
|
||||||
|
|
||||||
|
#undef Q_REDUCE
|
||||||
|
|
||||||
|
#elif defined(__SSE4_1__)
|
||||||
|
|
||||||
|
__m128i F[16] __attribute__ ((aligned (64)));
|
||||||
|
__m128i *mul = (__m128i*)multipliers;
|
||||||
|
__m128i *out = (__m128i*)output;
|
||||||
|
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
|
||||||
|
|
||||||
|
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
|
||||||
|
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
|
||||||
|
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
|
||||||
|
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
|
||||||
|
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
|
||||||
|
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
|
||||||
|
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
|
||||||
|
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
|
||||||
|
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
|
||||||
|
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
|
||||||
|
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
|
||||||
|
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
|
||||||
|
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
|
||||||
|
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
|
||||||
|
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
|
||||||
|
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
|
||||||
|
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
|
||||||
|
|
||||||
|
#define ADD_SUB( a, b ) \
|
||||||
|
{ \
|
||||||
|
__m128i tmp = b; \
|
||||||
|
b = _mm_sub_epi32( a, b ); \
|
||||||
|
a = _mm_add_epi32( a, tmp ); \
|
||||||
|
}
|
||||||
|
|
||||||
|
ADD_SUB( F[ 0], F[ 2] );
|
||||||
|
ADD_SUB( F[ 1], F[ 3] );
|
||||||
|
ADD_SUB( F[ 4], F[ 6] );
|
||||||
|
ADD_SUB( F[ 5], F[ 7] );
|
||||||
|
ADD_SUB( F[ 8], F[10] );
|
||||||
|
ADD_SUB( F[ 9], F[11] );
|
||||||
|
ADD_SUB( F[12], F[14] );
|
||||||
|
ADD_SUB( F[13], F[15] );
|
||||||
|
|
||||||
|
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
|
||||||
|
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
|
||||||
|
F[14] = _mm_slli_epi32( F[14], 4 );
|
||||||
|
F[15] = _mm_slli_epi32( F[15], 4 );
|
||||||
|
|
||||||
|
ADD_SUB( F[ 0], F[ 4] );
|
||||||
|
ADD_SUB( F[ 1], F[ 5] );
|
||||||
|
ADD_SUB( F[ 2], F[ 6] );
|
||||||
|
ADD_SUB( F[ 3], F[ 7] );
|
||||||
|
ADD_SUB( F[ 8], F[12] );
|
||||||
|
ADD_SUB( F[ 9], F[13] );
|
||||||
|
ADD_SUB( F[10], F[14] );
|
||||||
|
ADD_SUB( F[11], F[15] );
|
||||||
|
|
||||||
|
F[10] = _mm_slli_epi32( F[10], 2 );
|
||||||
|
F[11] = _mm_slli_epi32( F[11], 2 );
|
||||||
|
F[12] = _mm_slli_epi32( F[12], 4 );
|
||||||
|
F[13] = _mm_slli_epi32( F[13], 4 );
|
||||||
|
F[14] = _mm_slli_epi32( F[14], 6 );
|
||||||
|
F[15] = _mm_slli_epi32( F[15], 6 );
|
||||||
|
|
||||||
|
ADD_SUB( F[ 0], F[ 8] );
|
||||||
|
ADD_SUB( F[ 1], F[ 9] );
|
||||||
|
ADD_SUB( F[ 2], F[10] );
|
||||||
|
ADD_SUB( F[ 3], F[11] );
|
||||||
|
ADD_SUB( F[ 4], F[12] );
|
||||||
|
ADD_SUB( F[ 5], F[13] );
|
||||||
|
ADD_SUB( F[ 6], F[14] );
|
||||||
|
ADD_SUB( F[ 7], F[15] );
|
||||||
|
|
||||||
|
#undef ADD_SUB
|
||||||
|
|
||||||
|
#define Q_REDUCE( a ) \
|
||||||
|
_mm_sub_epi32( _mm_and_si128( a, \
|
||||||
|
m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) )
|
||||||
|
|
||||||
|
out[ 0] = Q_REDUCE( F[ 0] );
|
||||||
|
out[ 1] = Q_REDUCE( F[ 1] );
|
||||||
|
out[ 2] = Q_REDUCE( F[ 2] );
|
||||||
|
out[ 3] = Q_REDUCE( F[ 3] );
|
||||||
|
out[ 4] = Q_REDUCE( F[ 4] );
|
||||||
|
out[ 5] = Q_REDUCE( F[ 5] );
|
||||||
|
out[ 6] = Q_REDUCE( F[ 6] );
|
||||||
|
out[ 7] = Q_REDUCE( F[ 7] );
|
||||||
|
out[ 8] = Q_REDUCE( F[ 8] );
|
||||||
|
out[ 9] = Q_REDUCE( F[ 9] );
|
||||||
|
out[10] = Q_REDUCE( F[10] );
|
||||||
|
out[11] = Q_REDUCE( F[11] );
|
||||||
|
out[12] = Q_REDUCE( F[12] );
|
||||||
|
out[13] = Q_REDUCE( F[13] );
|
||||||
|
out[14] = Q_REDUCE( F[14] );
|
||||||
|
out[15] = Q_REDUCE( F[15] );
|
||||||
|
|
||||||
|
#undef Q_REDUCE
|
||||||
|
|
||||||
|
#else // < SSE4.1
|
||||||
|
|
||||||
swift_int16_t *mult = multipliers;
|
swift_int16_t *mult = multipliers;
|
||||||
|
|
||||||
|
// First loop unrolling:
|
||||||
|
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
swift_int32_t F[64];
|
swift_int32_t F[64];
|
||||||
|
|
||||||
@@ -666,9 +861,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
|||||||
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
|
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
|
||||||
F60, F61, F62, F63;
|
F60, F61, F62, F63;
|
||||||
|
|
||||||
// First loop unrolling:
|
|
||||||
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
|
||||||
|
|
||||||
F0 = mult[0] * table[0];
|
F0 = mult[0] * table[0];
|
||||||
F8 = mult[1] * table[1];
|
F8 = mult[1] * table[1];
|
||||||
F16 = mult[2] * table[2];
|
F16 = mult[2] * table[2];
|
||||||
@@ -678,89 +870,92 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
|||||||
F48 = mult[6] * table[6];
|
F48 = mult[6] * table[6];
|
||||||
F56 = mult[7] * table[7];
|
F56 = mult[7] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[1] << 3]);
|
table = &(fftTable[input[1] << 3]);
|
||||||
|
|
||||||
F1 = mult[0] * table[0];
|
F1 = mult[ 8] * table[0];
|
||||||
F9 = mult[1] * table[1];
|
F9 = mult[ 9] * table[1];
|
||||||
F17 = mult[2] * table[2];
|
F17 = mult[10] * table[2];
|
||||||
F25 = mult[3] * table[3];
|
F25 = mult[11] * table[3];
|
||||||
F33 = mult[4] * table[4];
|
F33 = mult[12] * table[4];
|
||||||
F41 = mult[5] * table[5];
|
F41 = mult[13] * table[5];
|
||||||
F49 = mult[6] * table[6];
|
F49 = mult[14] * table[6];
|
||||||
F57 = mult[7] * table[7];
|
F57 = mult[15] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[2] << 3]);
|
table = &(fftTable[input[2] << 3]);
|
||||||
|
|
||||||
F2 = mult[0] * table[0];
|
F2 = mult[16] * table[0];
|
||||||
F10 = mult[1] * table[1];
|
F10 = mult[17] * table[1];
|
||||||
F18 = mult[2] * table[2];
|
F18 = mult[18] * table[2];
|
||||||
F26 = mult[3] * table[3];
|
F26 = mult[19] * table[3];
|
||||||
F34 = mult[4] * table[4];
|
F34 = mult[20] * table[4];
|
||||||
F42 = mult[5] * table[5];
|
F42 = mult[21] * table[5];
|
||||||
F50 = mult[6] * table[6];
|
F50 = mult[22] * table[6];
|
||||||
F58 = mult[7] * table[7];
|
F58 = mult[23] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[3] << 3]);
|
table = &(fftTable[input[3] << 3]);
|
||||||
|
|
||||||
F3 = mult[0] * table[0];
|
F3 = mult[24] * table[0];
|
||||||
F11 = mult[1] * table[1];
|
F11 = mult[25] * table[1];
|
||||||
F19 = mult[2] * table[2];
|
F19 = mult[26] * table[2];
|
||||||
F27 = mult[3] * table[3];
|
F27 = mult[27] * table[3];
|
||||||
F35 = mult[4] * table[4];
|
F35 = mult[28] * table[4];
|
||||||
F43 = mult[5] * table[5];
|
F43 = mult[29] * table[5];
|
||||||
F51 = mult[6] * table[6];
|
F51 = mult[30] * table[6];
|
||||||
F59 = mult[7] * table[7];
|
F59 = mult[31] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[4] << 3]);
|
table = &(fftTable[input[4] << 3]);
|
||||||
|
|
||||||
F4 = mult[0] * table[0];
|
F4 = mult[32] * table[0];
|
||||||
F12 = mult[1] * table[1];
|
F12 = mult[33] * table[1];
|
||||||
F20 = mult[2] * table[2];
|
F20 = mult[34] * table[2];
|
||||||
F28 = mult[3] * table[3];
|
F28 = mult[35] * table[3];
|
||||||
F36 = mult[4] * table[4];
|
F36 = mult[36] * table[4];
|
||||||
F44 = mult[5] * table[5];
|
F44 = mult[37] * table[5];
|
||||||
F52 = mult[6] * table[6];
|
F52 = mult[38] * table[6];
|
||||||
F60 = mult[7] * table[7];
|
F60 = mult[39] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[5] << 3]);
|
table = &(fftTable[input[5] << 3]);
|
||||||
|
|
||||||
F5 = mult[0] * table[0];
|
F5 = mult[40] * table[0];
|
||||||
F13 = mult[1] * table[1];
|
F13 = mult[41] * table[1];
|
||||||
F21 = mult[2] * table[2];
|
F21 = mult[42] * table[2];
|
||||||
F29 = mult[3] * table[3];
|
F29 = mult[43] * table[3];
|
||||||
F37 = mult[4] * table[4];
|
F37 = mult[44] * table[4];
|
||||||
F45 = mult[5] * table[5];
|
F45 = mult[45] * table[5];
|
||||||
F53 = mult[6] * table[6];
|
F53 = mult[46] * table[6];
|
||||||
F61 = mult[7] * table[7];
|
F61 = mult[47] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[6] << 3]);
|
table = &(fftTable[input[6] << 3]);
|
||||||
|
|
||||||
F6 = mult[0] * table[0];
|
F6 = mult[48] * table[0];
|
||||||
F14 = mult[1] * table[1];
|
F14 = mult[49] * table[1];
|
||||||
F22 = mult[2] * table[2];
|
F22 = mult[50] * table[2];
|
||||||
F30 = mult[3] * table[3];
|
F30 = mult[51] * table[3];
|
||||||
F38 = mult[4] * table[4];
|
F38 = mult[52] * table[4];
|
||||||
F46 = mult[5] * table[5];
|
F46 = mult[53] * table[5];
|
||||||
F54 = mult[6] * table[6];
|
F54 = mult[54] * table[6];
|
||||||
F62 = mult[7] * table[7];
|
F62 = mult[55] * table[7];
|
||||||
|
|
||||||
mult += 8;
|
|
||||||
table = &(fftTable[input[7] << 3]);
|
table = &(fftTable[input[7] << 3]);
|
||||||
|
|
||||||
F7 = mult[0] * table[0];
|
F7 = mult[56] * table[0];
|
||||||
F15 = mult[1] * table[1];
|
F15 = mult[57] * table[1];
|
||||||
F23 = mult[2] * table[2];
|
F23 = mult[58] * table[2];
|
||||||
F31 = mult[3] * table[3];
|
F31 = mult[59] * table[3];
|
||||||
F39 = mult[4] * table[4];
|
F39 = mult[60] * table[4];
|
||||||
F47 = mult[5] * table[5];
|
F47 = mult[61] * table[5];
|
||||||
F55 = mult[6] * table[6];
|
F55 = mult[62] * table[6];
|
||||||
F63 = mult[7] * table[7];
|
F63 = mult[63] * table[7];
|
||||||
|
|
||||||
|
#define ADD_SUB( a, b ) \
|
||||||
|
{ \
|
||||||
|
int temp = b; \
|
||||||
|
b = a - b; \
|
||||||
|
a = a + temp; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define Q_REDUCE( a ) \
|
||||||
|
( ( (a) & 0xff ) - ( (a) >> 8 ) )
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
||||||
@@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
// Second loop unrolling:
|
// Second loop unrolling:
|
||||||
// Iteration 0:
|
// Iteration 0:
|
||||||
ADD_SUB(F0, F1);
|
ADD_SUB(F0, F1);
|
||||||
@@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
|||||||
output[47] = Q_REDUCE(F61);
|
output[47] = Q_REDUCE(F61);
|
||||||
output[55] = Q_REDUCE(F62);
|
output[55] = Q_REDUCE(F62);
|
||||||
output[63] = Q_REDUCE(F63);
|
output[63] = Q_REDUCE(F63);
|
||||||
|
|
||||||
|
#undef ADD_SUB
|
||||||
|
#undef Q_REDUCE
|
||||||
|
|
||||||
|
#endif // AVX2 elif SSE4.1 else
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculates the FFT part of SWIFFT.
|
// Calculates the FFT part of SWIFFT.
|
||||||
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
|
|||||||
// - m: the input size divided by 64.
|
// - m: the input size divided by 64.
|
||||||
// - output: will store the result.
|
// - output: will store the result.
|
||||||
// - a: the coefficients in the sum. Of size 64 * m.
|
// - a: the coefficients in the sum. Of size 64 * m.
|
||||||
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
|
void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
|
||||||
|
const swift_int16_t *a )
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
swift_int32_t result[N];
|
swift_int32_t result[N] __attribute__ ((aligned (64)));
|
||||||
register swift_int16_t carry = 0;
|
register swift_int16_t carry = 0;
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
__m512i *res = (__m512i*)result;
|
||||||
|
for ( j = 0; j < N/16; ++j )
|
||||||
|
{
|
||||||
|
__m512i sum = _mm512_setzero_si512();
|
||||||
|
const __m512i *f = (__m512i*)input + j;
|
||||||
|
const __m512i *k = (__m512i*)a + j;
|
||||||
|
for ( i = 0; i < m; i++, f += N/16, k += N/16 )
|
||||||
|
sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
|
||||||
|
res[j] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
|
__m256i *res = (__m256i*)result;
|
||||||
|
for ( j = 0; j < N/8; ++j )
|
||||||
|
{
|
||||||
|
__m256i sum = _mm256_setzero_si256();
|
||||||
|
const __m256i *f = (__m256i*)input + j;
|
||||||
|
const __m256i *k = (__m256i*)a + j;
|
||||||
|
for ( i = 0; i < m; i++, f += N/8, k += N/8 )
|
||||||
|
sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
|
||||||
|
res[j] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__SSE4_1__)
|
||||||
|
|
||||||
|
__m128i *res = (__m128i*)result;
|
||||||
|
for ( j = 0; j < N/4; ++j )
|
||||||
|
{
|
||||||
|
__m128i sum = _mm_setzero_si128();
|
||||||
|
const __m128i *f = (__m128i*)input + j;
|
||||||
|
const __m128i *k = (__m128i*)a + j;
|
||||||
|
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
|
||||||
|
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
|
||||||
|
res[j] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
for (j = 0; j < N; ++j)
|
for (j = 0; j < N; ++j)
|
||||||
{
|
{
|
||||||
register swift_int32_t sum = 0;
|
register swift_int32_t sum = 0;
|
||||||
const register swift_int32_t *f = input + j;
|
const register swift_int32_t *f = input + j;
|
||||||
const register swift_int16_t *k = a + j;
|
const register swift_int16_t *k = a + j;
|
||||||
|
|
||||||
for (i = 0; i < m; i++, f += N,k += N)
|
for (i = 0; i < m; i++, f += N,k += N)
|
||||||
sum += (*f) * (*k);
|
sum += (*f) * (*k);
|
||||||
|
|
||||||
result[j] = sum;
|
result[j] = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
for (j = 0; j < N; ++j)
|
for (j = 0; j < N; ++j)
|
||||||
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
|
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
|
||||||
|
|
||||||
@@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
// Will store the result of the FFT parts:
|
// Will store the result of the FFT parts:
|
||||||
swift_int32_t fftOut[N * M];
|
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||||
unsigned char intermediate[N * 3 + 8];
|
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
|
||||||
unsigned char carry0,carry1,carry2;
|
unsigned char carry0,carry1,carry2;
|
||||||
|
|
||||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||||
@@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
// Will store the result of the FFT parts:
|
// Will store the result of the FFT parts:
|
||||||
swift_int32_t fftOut[N * M];
|
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||||
unsigned char intermediate[N * 3 + 8];
|
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
|
||||||
unsigned char carry0,carry1,carry2;
|
unsigned char carry0,carry1,carry2;
|
||||||
|
|
||||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||||
|
|||||||
(File diff suppressed because it is too large.)

algo/verthash/Verthash.c  (new file, 735 lines)
@@ -0,0 +1,735 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2018-2021 CryptoGraphics
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License as published by the Free
|
||||||
|
* Software Foundation; either version 2 of the License, or (at your option)
|
||||||
|
* any later version. See LICENSE for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "algo-gate-api.h"
|
||||||
|
#include "Verthash.h"
|
||||||
|
#include "mm_malloc.h"
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Verthash info management
|
||||||
|
int verthash_info_init(verthash_info_t* info, const char* file_name)
|
||||||
|
{
|
||||||
|
// init fields to 0
|
||||||
|
info->fileName = NULL;
|
||||||
|
info->data = NULL;
|
||||||
|
info->dataSize = 0;
|
||||||
|
info->bitmask = 0;
|
||||||
|
size_t fileNameLen;
|
||||||
|
|
||||||
|
if ( !file_name || !( fileNameLen = strlen( file_name ) ) )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "Invalid file specification" );
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
info->fileName = (char*)malloc( fileNameLen + 1 );
|
||||||
|
if ( !info->fileName )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "Failed to allocate memory for Verthash data" );
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset( info->fileName, 0, fileNameLen + 1 );
|
||||||
|
memcpy( info->fileName, file_name, fileNameLen );
|
||||||
|
|
||||||
|
FILE *fileMiningData = fopen_utf8( info->fileName, "rb" );
|
||||||
|
if ( !fileMiningData )
|
||||||
|
{
|
||||||
|
if ( opt_data_file || !opt_verify )
|
||||||
|
{
|
||||||
|
if ( opt_data_file )
|
||||||
|
applog( LOG_ERR, "Verthash data file not found or invalid: %s",
|
||||||
|
info->fileName );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
applog( LOG_ERR,
|
||||||
|
"No Verthash data file specified and default not found");
|
||||||
|
applog( LOG_NOTICE,
|
||||||
|
"Add '--verify' to create default 'verthash.dat'");
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
applog( LOG_NOTICE, "Creating default 'verthash.dat' in current directory, this will take several minutes");
|
||||||
|
if ( verthash_generate_data_file( info->fileName ) )
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
fileMiningData = fopen_utf8( info->fileName, "rb" );
|
||||||
|
if ( !fileMiningData )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "File system error opening %s", info->fileName );
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
applog( LOG_NOTICE, "Verthash data file created successfully" );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get file size
|
||||||
|
fseek(fileMiningData, 0, SEEK_END);
|
||||||
|
int fileSize = ftell(fileMiningData);
|
||||||
|
fseek(fileMiningData, 0, SEEK_SET);
|
||||||
|
|
||||||
|
if ( fileSize < 0 )
|
||||||
|
{
|
||||||
|
fclose(fileMiningData);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate data
|
||||||
|
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
|
||||||
|
if (!info->data)
|
||||||
|
{
|
||||||
|
fclose(fileMiningData);
|
||||||
|
// Memory allocation fatal error.
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load data
|
||||||
|
if ( !fread( info->data, fileSize, 1, fileMiningData ) )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "File system error reading %s", info->fileName );
|
||||||
|
fclose(fileMiningData);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(fileMiningData);
|
||||||
|
|
||||||
|
// Update fields
|
||||||
|
info->bitmask = ((fileSize - VH_HASH_OUT_SIZE)/VH_BYTE_ALIGNMENT) + 1;
|
||||||
|
info->dataSize = fileSize;
|
||||||
|
|
||||||
|
applog( LOG_NOTICE, "Using Verthash data file '%s'", info->fileName );
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
void verthash_info_free(verthash_info_t* info)
|
||||||
|
{
|
||||||
|
free(info->fileName);
|
||||||
|
free(info->data);
|
||||||
|
info->dataSize = 0;
|
||||||
|
info->bitmask = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Verthash hash
|
||||||
|
#define VH_P0_SIZE 64
|
||||||
|
#define VH_N_ITER 8
|
||||||
|
#define VH_N_SUBSET VH_P0_SIZE*VH_N_ITER
|
||||||
|
#define VH_N_ROT 32
|
||||||
|
#define VH_N_INDEXES 4096
|
||||||
|
#define VH_BYTE_ALIGNMENT 16
|
||||||
|
|
||||||
|
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
|
||||||
|
{
|
||||||
|
return (a ^ b) * 0x1000193;
|
||||||
|
}
|
||||||
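fnv1a() above is the 32-bit FNV-1a step (prime 0x1000193); verthash_hash() further down seeds it with the FNV-1a offset basis 0x811c9dc5 and chains it word by word through the UPDATE_ACCUMULATOR macro. A stand-alone sketch of that chaining (the helper names here are hypothetical, not from the sources):

#include <stdint.h>
#include <stddef.h>

static inline uint32_t fnv1a_word( uint32_t a, uint32_t b )
{
   return ( a ^ b ) * 0x1000193;          // 32-bit FNV prime
}

static uint32_t fnv1a_accumulate( const uint32_t *words, size_t n )
{
   uint32_t acc = 0x811c9dc5;             // FNV-1a offset basis, as in verthash_hash()
   for ( size_t i = 0; i < n; i++ )
      acc = fnv1a_word( acc, words[i] );  // fold in one 32-bit word per step
   return acc;
}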
|
|
||||||
|
#if 0
|
||||||
|
static void rotate_indexes( uint32_t *p )
|
||||||
|
{
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
|
||||||
|
{
|
||||||
|
__m256i *px = (__m256i*)p + x;
|
||||||
|
|
||||||
|
px[0] = mm256_rol_32( px[0], 1 );
|
||||||
|
px[1] = mm256_rol_32( px[1], 1 );
|
||||||
|
px[2] = mm256_rol_32( px[2], 1 );
|
||||||
|
px[3] = mm256_rol_32( px[3], 1 );
|
||||||
|
px[4] = mm256_rol_32( px[4], 1 );
|
||||||
|
px[5] = mm256_rol_32( px[5], 1 );
|
||||||
|
px[6] = mm256_rol_32( px[6], 1 );
|
||||||
|
px[7] = mm256_rol_32( px[7], 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
|
||||||
|
{
|
||||||
|
__m128i *px = (__m128i*)p0_index + x;
|
||||||
|
|
||||||
|
px[0] = mm128_rol_32( px[0], 1 );
|
||||||
|
px[1] = mm128_rol_32( px[1], 1 );
|
||||||
|
px[2] = mm128_rol_32( px[2], 1 );
|
||||||
|
px[3] = mm128_rol_32( px[3], 1 );
|
||||||
|
px[4] = mm128_rol_32( px[4], 1 );
|
||||||
|
px[5] = mm128_rol_32( px[5], 1 );
|
||||||
|
px[6] = mm128_rol_32( px[6], 1 );
|
||||||
|
px[7] = mm128_rol_32( px[7], 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
|
||||||
|
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// Vectorized and targetted version of fnv1a
|
||||||
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
|
#define MULXOR \
|
||||||
|
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
|
||||||
|
*(__m256i*)hash, *(__m256i*)blob_off ), k );
|
||||||
|
|
||||||
|
#elif defined(__SSE41__)
|
||||||
|
|
||||||
|
#define MULXOR \
|
||||||
|
casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
|
||||||
|
casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \
|
||||||
|
casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
|
||||||
|
casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define MULXOR \
|
||||||
|
for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
|
||||||
|
hash[j] = fnv1a( hash[j], blob_off[j] ); \
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define UPDATE_ACCUMULATOR \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[0] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[1] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[2] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[3] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[4] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[5] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[6] ); \
|
||||||
|
accumulator = fnv1a( accumulator, blob_off[7] )
|
||||||
|
|
||||||
|
|
||||||
|
// first pass no rotate
|
||||||
|
#define ROUND_0 \
|
||||||
|
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||||
|
{ \
|
||||||
|
const uint32_t *blob_off = blob + \
|
||||||
|
( ( fnv1a( subset[i], accumulator ) % mdiv ) \
|
||||||
|
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
|
||||||
|
UPDATE_ACCUMULATOR; \
|
||||||
|
MULXOR; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// subsequent passes rotate by r on demand, no need for mass rotate
|
||||||
|
#define ROUND_r( r ) \
|
||||||
|
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||||
|
{ \
|
||||||
|
const uint32_t *blob_off = blob + \
|
||||||
|
( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \
|
||||||
|
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
|
||||||
|
UPDATE_ACCUMULATOR; \
|
||||||
|
MULXOR; \
|
||||||
|
}
|
||||||
|
|
||||||
|
void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||||
|
const void *input, void *output )
|
||||||
|
{
|
||||||
|
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
|
||||||
|
const uint32_t *blob = (const uint32_t*)blob_bytes;
|
||||||
|
uint32_t accumulator = 0x811c9dc5;
|
||||||
|
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
|
||||||
|
/ VH_BYTE_ALIGNMENT ) + 1;
|
||||||
|
#if defined (__AVX2__)
|
||||||
|
const __m256i k = _mm256_set1_epi32( 0x1000193 );
|
||||||
|
#elif defined(__SSE41__)
|
||||||
|
const __m128i k = _mm_set1_epi32( 0x1000193 );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );
|
||||||
|
verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );
|
||||||
|
|
||||||
|
ROUND_0;
|
||||||
|
for ( size_t r = 1; r < VH_N_ROT; ++r )
|
||||||
|
ROUND_r( r );
|
||||||
|
|
||||||
|
memcpy( output, hash, VH_HASH_OUT_SIZE );
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Verthash data file generator
|
||||||
|
|
||||||
|
#define NODE_SIZE 32
|
||||||
|
|
||||||
|
struct Graph
|
||||||
|
{
|
||||||
|
FILE *db;
|
||||||
|
int64_t log2;
|
||||||
|
int64_t pow2;
|
||||||
|
uint8_t *pk;
|
||||||
|
int64_t index;
|
||||||
|
};
|
||||||
|
|
||||||
|
int64_t Log2(int64_t x)
|
||||||
|
{
|
||||||
|
int64_t r = 0;
|
||||||
|
for (; x > 1; x >>= 1)
|
||||||
|
{
|
||||||
|
r++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t bfsToPost(struct Graph *g, const int64_t node)
|
||||||
|
{
|
||||||
|
return node & ~g->pow2;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t numXi(int64_t index)
|
||||||
|
{
|
||||||
|
return (1 << ((uint64_t)index)) * (index + 1) * index;
|
||||||
|
}
|
||||||
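For scale: verthash_generate_data_file() at the end of this file calls NewGraph() with index 17, and numXi(17) = 2^17 * 18 * 17 = 40,108,032 nodes; at NODE_SIZE (32) bytes per node that works out to roughly 1.2 GB, consistent with the size of the generated verthash.dat file. A quick illustrative check (not part of the sources):

#include <stdio.h>
#include <stdint.h>

int main()
{
   const int64_t index = 17;
   const int64_t nodes = ( (int64_t)1 << index ) * ( index + 1 ) * index;
   printf( "nodes = %lld, approx bytes = %lld\n",
           (long long)nodes, (long long)( nodes * 32 ) );
   return 0;
}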
|
|
||||||
|
void WriteId(struct Graph *g, uint8_t *Node, const int64_t id)
|
||||||
|
{
|
||||||
|
fseek(g->db, id * NODE_SIZE, SEEK_SET);
|
||||||
|
fwrite(Node, 1, NODE_SIZE, g->db);
|
||||||
|
}
|
||||||
|
|
||||||
|
void WriteNode(struct Graph *g, uint8_t *Node, const int64_t id)
|
||||||
|
{
|
||||||
|
const int64_t idx = bfsToPost(g, id);
|
||||||
|
WriteId(g, Node, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void NewNode(struct Graph *g, const int64_t id, uint8_t *hash)
|
||||||
|
{
|
||||||
|
WriteNode(g, hash, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t *GetId(struct Graph *g, const int64_t id)
|
||||||
|
{
|
||||||
|
fseek(g->db, id * NODE_SIZE, SEEK_SET);
|
||||||
|
uint8_t *node = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
const size_t bytes_read = fread(node, 1, NODE_SIZE, g->db);
|
||||||
|
if(bytes_read != NODE_SIZE) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return node;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t *GetNode(struct Graph *g, const int64_t id)
|
||||||
|
{
|
||||||
|
const int64_t idx = bfsToPost(g, id);
|
||||||
|
return GetId(g, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t WriteVarInt(uint8_t *buffer, int64_t val)
|
||||||
|
{
|
||||||
|
memset(buffer, 0, NODE_SIZE);
|
||||||
|
uint64_t uval = ((uint64_t)(val)) << 1;
|
||||||
|
if (val < 0)
|
||||||
|
{
|
||||||
|
uval = ~uval;
|
||||||
|
}
|
||||||
|
uint32_t i = 0;
|
||||||
|
while (uval >= 0x80)
|
||||||
|
{
|
||||||
|
buffer[i] = (uint8_t)uval | 0x80;
|
||||||
|
uval >>= 7;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
buffer[i] = (uint8_t)uval;
|
||||||
|
return i;
|
||||||
|
}
|
||||||
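WriteVarInt() above zigzag-maps the signed value (shift left one bit, bitwise complement when negative) and then emits LEB128-style 7-bit groups with the high bit set on all but the last byte. A companion decoder sketch that reverses it; this function is not present in the original file and is shown only to clarify the encoding:

static int64_t ReadVarInt( const uint8_t *buffer )
{
   uint64_t uval = 0;
   int shift = 0;
   uint32_t i = 0;
   while ( buffer[i] & 0x80 )                            // continuation bytes
   {
      uval |= (uint64_t)( buffer[i] & 0x7f ) << shift;
      shift += 7;
      i++;
   }
   uval |= (uint64_t)buffer[i] << shift;                 // final byte, high bit clear
   // undo the zigzag mapping: even values are non-negative, odd are complemented
   return ( uval & 1 ) ? ~(int64_t)( uval >> 1 ) : (int64_t)( uval >> 1 );
}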
|
|
||||||
|
void ButterflyGraph(struct Graph *g, int64_t index, int64_t *count)
|
||||||
|
{
|
||||||
|
if (index == 0)
|
||||||
|
{
|
||||||
|
index = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t numLevel = 2 * index;
|
||||||
|
int64_t perLevel = (int64_t)(1 << (uint64_t)index);
|
||||||
|
int64_t begin = *count - perLevel;
|
||||||
|
int64_t level, i;
|
||||||
|
|
||||||
|
for (level = 1; level < numLevel; level++)
|
||||||
|
{
|
||||||
|
for (i = 0; i < perLevel; i++)
|
||||||
|
{
|
||||||
|
int64_t prev;
|
||||||
|
int64_t shift = index - level;
|
||||||
|
if (level > numLevel / 2)
|
||||||
|
{
|
||||||
|
shift = level - numLevel / 2;
|
||||||
|
}
|
||||||
|
if (((i >> (uint64_t)shift) & 1) == 0)
|
||||||
|
{
|
||||||
|
prev = i + (1 << (uint64_t)shift);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
prev = i - (1 << (uint64_t)shift);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t *parent0 = GetNode(g, begin + (level - 1) * perLevel + prev);
|
||||||
|
uint8_t *parent1 = GetNode(g, *count - perLevel);
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, *count);
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, *count, hashOutput);
|
||||||
|
(*count)++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(parent0);
|
||||||
|
free(parent1);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void XiGraphIter(struct Graph *g, int64_t index)
|
||||||
|
{
|
||||||
|
int64_t count = g->pow2;
|
||||||
|
|
||||||
|
int8_t stackSize = 5;
|
||||||
|
int64_t *stack = (int64_t *)malloc(sizeof(int64_t) * stackSize);
|
||||||
|
for (int i = 0; i < 5; i++)
|
||||||
|
stack[i] = index;
|
||||||
|
|
||||||
|
int8_t graphStackSize = 5;
|
||||||
|
int32_t *graphStack = (int32_t *)malloc(sizeof(int32_t) * graphStackSize);
|
||||||
|
for (int i = 0; i < 5; i++)
|
||||||
|
graphStack[i] = graphStackSize - i - 1;
|
||||||
|
|
||||||
|
int64_t i = 0;
|
||||||
|
int64_t graph = 0;
|
||||||
|
int64_t pow2index = 1 << ((uint64_t)index);
|
||||||
|
|
||||||
|
for (i = 0; i < pow2index; i++)
|
||||||
|
{
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, count);
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 2);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
|
||||||
|
sha3(hashInput, NODE_SIZE * 2, hashOutput, NODE_SIZE);
|
||||||
|
NewNode(g, count, hashOutput);
|
||||||
|
count++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (index == 1)
|
||||||
|
{
|
||||||
|
ButterflyGraph(g, index, &count);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (stackSize != 0 && graphStackSize != 0)
|
||||||
|
{
|
||||||
|
|
||||||
|
index = stack[stackSize - 1];
|
||||||
|
graph = graphStack[graphStackSize - 1];
|
||||||
|
|
||||||
|
stackSize--;
|
||||||
|
if (stackSize > 0)
|
||||||
|
{
|
||||||
|
int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize));
|
||||||
|
memcpy(tempStack, stack, sizeof(int64_t) * (stackSize));
|
||||||
|
free(stack);
|
||||||
|
stack = tempStack;
|
||||||
|
}
|
||||||
|
|
||||||
|
graphStackSize--;
|
||||||
|
if (graphStackSize > 0)
|
||||||
|
{
|
||||||
|
int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize));
|
||||||
|
memcpy(tempGraphStack, graphStack, sizeof(int32_t) * (graphStackSize));
|
||||||
|
free(graphStack);
|
||||||
|
graphStack = tempGraphStack;
|
||||||
|
}
|
||||||
|
|
||||||
|
int8_t indicesSize = 5;
|
||||||
|
int64_t *indices = (int64_t *)malloc(sizeof(int64_t) * indicesSize);
|
||||||
|
for (int i = 0; i < indicesSize; i++)
|
||||||
|
indices[i] = index - 1;
|
||||||
|
|
||||||
|
int8_t graphsSize = 5;
|
||||||
|
int32_t *graphs = (int32_t *)malloc(sizeof(int32_t) * graphsSize);
|
||||||
|
for (int i = 0; i < graphsSize; i++)
|
||||||
|
graphs[i] = graphsSize - i - 1;
|
||||||
|
|
||||||
|
int64_t pow2indexInner = 1 << ((uint64_t)index);
|
||||||
|
int64_t pow2indexInner_1 = 1 << ((uint64_t)index - 1);
|
||||||
|
|
||||||
|
if (graph == 0)
|
||||||
|
{
|
||||||
|
uint64_t sources = count - pow2indexInner;
|
||||||
|
for (i = 0; i < pow2indexInner_1; i++)
|
||||||
|
{
|
||||||
|
uint8_t *parent0 = GetNode(g, sources + i);
|
||||||
|
uint8_t *parent1 = GetNode(g, sources + i + pow2indexInner_1);
|
||||||
|
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, count);
|
||||||
|
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, count, hashOutput);
|
||||||
|
count++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(parent0);
|
||||||
|
free(parent1);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (graph == 1)
|
||||||
|
{
|
||||||
|
uint64_t firstXi = count;
|
||||||
|
for (i = 0; i < pow2indexInner_1; i++)
|
||||||
|
{
|
||||||
|
uint64_t nodeId = firstXi + i;
|
||||||
|
uint8_t *parent = GetNode(g, firstXi - pow2indexInner_1 + i);
|
||||||
|
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, nodeId);
|
||||||
|
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, count, hashOutput);
|
||||||
|
count++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(parent);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (graph == 2)
|
||||||
|
{
|
||||||
|
uint64_t secondXi = count;
|
||||||
|
for (i = 0; i < pow2indexInner_1; i++)
|
||||||
|
{
|
||||||
|
uint64_t nodeId = secondXi + i;
|
||||||
|
uint8_t *parent = GetNode(g, secondXi - pow2indexInner_1 + i);
|
||||||
|
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, nodeId);
|
||||||
|
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, count, hashOutput);
|
||||||
|
count++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(parent);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (graph == 3)
|
||||||
|
{
|
||||||
|
uint64_t secondButter = count;
|
||||||
|
for (i = 0; i < pow2indexInner_1; i++)
|
||||||
|
{
|
||||||
|
uint64_t nodeId = secondButter + i;
|
||||||
|
uint8_t *parent = GetNode(g, secondButter - pow2indexInner_1 + i);
|
||||||
|
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, nodeId);
|
||||||
|
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, count, hashOutput);
|
||||||
|
count++;
|
||||||
|
|
||||||
|
free(hashOutput);
|
||||||
|
free(hashInput);
|
||||||
|
free(parent);
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
uint64_t sinks = count;
|
||||||
|
uint64_t sources = sinks + pow2indexInner - numXi(index);
|
||||||
|
for (i = 0; i < pow2indexInner_1; i++)
|
||||||
|
{
|
||||||
|
uint64_t nodeId0 = sinks + i;
|
||||||
|
uint64_t nodeId1 = sinks + i + pow2indexInner_1;
|
||||||
|
uint8_t *parent0 = GetNode(g, sinks - pow2indexInner_1 + i);
|
||||||
|
uint8_t *parent1_0 = GetNode(g, sources + i);
|
||||||
|
uint8_t *parent1_1 = GetNode(g, sources + i + pow2indexInner_1);
|
||||||
|
|
||||||
|
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
WriteVarInt(buf, nodeId0);
|
||||||
|
|
||||||
|
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 3), parent1_0, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput0 = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 4, hashOutput0, NODE_SIZE);
|
||||||
|
|
||||||
|
WriteVarInt(buf, nodeId1);
|
||||||
|
|
||||||
|
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||||
|
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||||
|
memcpy(hashInput + (NODE_SIZE * 3), parent1_1, NODE_SIZE);
|
||||||
|
|
||||||
|
uint8_t *hashOutput1 = (uint8_t *)malloc(NODE_SIZE);
|
||||||
|
sha3(hashInput, NODE_SIZE * 4, hashOutput1, NODE_SIZE);
|
||||||
|
|
||||||
|
NewNode(g, nodeId0, hashOutput0);
|
||||||
|
NewNode(g, nodeId1, hashOutput1);
|
||||||
|
count += 2;
|
||||||
|
|
||||||
|
free(parent0);
|
||||||
|
free(parent1_0);
|
||||||
|
free(parent1_1);
|
||||||
|
free(buf);
|
||||||
|
free(hashInput);
|
||||||
|
free(hashOutput0);
|
||||||
|
free(hashOutput1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((graph == 0 || graph == 3) ||
|
||||||
|
((graph == 1 || graph == 2) && index == 2))
|
||||||
|
{
|
||||||
|
ButterflyGraph(g, index - 1, &count);
|
||||||
|
}
|
||||||
|
else if (graph == 1 || graph == 2)
|
||||||
|
{
|
||||||
|
|
||||||
|
int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize + indicesSize));
|
||||||
|
memcpy(tempStack, stack, stackSize * sizeof(int64_t));
|
||||||
|
memcpy(tempStack + stackSize, indices, indicesSize * sizeof(int64_t));
|
||||||
|
stackSize += indicesSize;
|
||||||
|
free(stack);
|
||||||
|
stack = tempStack;
|
||||||
|
|
||||||
|
int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize + graphsSize));
|
||||||
|
memcpy(tempGraphStack, graphStack, graphStackSize * sizeof(int32_t));
|
||||||
|
memcpy(tempGraphStack + graphStackSize, graphs, graphsSize * sizeof(int32_t));
|
||||||
|
graphStackSize += graphsSize;
|
||||||
|
free(graphStack);
|
||||||
|
graphStack = tempGraphStack;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(indices);
|
||||||
|
free(graphs);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(stack);
|
||||||
|
free(graphStack);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk)
|
||||||
|
{
|
||||||
|
uint8_t exists = 0;
|
||||||
|
FILE *db;
|
||||||
|
if ((db = fopen_utf8(targetFile, "r")) != NULL)
|
||||||
|
{
|
||||||
|
fclose(db);
|
||||||
|
exists = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
db = fopen_utf8(targetFile, "wb+");
|
||||||
|
int64_t size = numXi(index);
|
||||||
|
int64_t log2 = Log2(size) + 1;
|
||||||
|
int64_t pow2 = 1 << ((uint64_t)log2);
|
||||||
|
|
||||||
|
struct Graph *g = (struct Graph *)malloc(sizeof(struct Graph));
|
||||||
|
|
||||||
|
if ( !g ) return NULL;
|
||||||
|
|
||||||
|
g->db = db;
|
||||||
|
g->log2 = log2;
|
||||||
|
g->pow2 = pow2;
|
||||||
|
g->pk = pk;
|
||||||
|
g->index = index;
|
||||||
|
|
||||||
|
if (exists == 0)
|
||||||
|
{
|
||||||
|
XiGraphIter(g, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(db);
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// use info for _mm_malloc, then verify file
|
||||||
|
int verthash_generate_data_file(const char* output_file_name)
|
||||||
|
{
|
||||||
|
const char *hashInput = "Verthash Proof-of-Space Datafile";
|
||||||
|
uint8_t *pk = (uint8_t*)malloc( NODE_SIZE );
|
||||||
|
|
||||||
|
if ( !pk )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "Verthash data memory allocation failed");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
sha3( hashInput, 32, pk, NODE_SIZE );
|
||||||
|
|
||||||
|
int64_t index = 17;
|
||||||
|
if ( !NewGraph( index, output_file_name, pk ) )
|
||||||
|
{
|
||||||
|
applog( LOG_ERR, "Verthash file creation failed");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
algo/verthash/Verthash.h  (new file, 57 lines)
@@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2018-2021 CryptoGraphics
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License as published by the Free
|
||||||
|
* Software Foundation; either version 2 of the License, or (at your option)
|
||||||
|
* any later version. See LICENSE for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef Verthash_INCLUDE_ONCE
|
||||||
|
#define Verthash_INCLUDE_ONCE
|
||||||
|
|
||||||
|
#include "tiny_sha3/sha3.h"
|
||||||
|
#include "fopen_utf8.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// Verthash constants used to compute bitmask, used inside kernel during IO pass
|
||||||
|
#define VH_HASH_OUT_SIZE 32
|
||||||
|
#define VH_BYTE_ALIGNMENT 16
|
||||||
|
#define VH_HEADER_SIZE 80
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Verthash data
|
||||||
|
//! Verthash C api for data maniputation.
|
||||||
|
typedef struct VerthashInfo
|
||||||
|
{
|
||||||
|
char* fileName;
|
||||||
|
uint8_t* data;
|
||||||
|
uint64_t dataSize;
|
||||||
|
uint32_t bitmask;
|
||||||
|
} verthash_info_t;
|
||||||
|
|
||||||
|
//! Must be called before usage. Reset all fields and set a mining data file name.
|
||||||
|
//! Error codes
|
||||||
|
//! 0 - Success(No error).
|
||||||
|
//! 1 - File name is invalid.
|
||||||
|
//! 2 - Memory allocation error
|
||||||
|
int verthash_info_init(verthash_info_t* info, const char* file_name);
|
||||||
|
|
||||||
|
//! Reset all fields and free allocated data.
|
||||||
|
void verthash_info_free(verthash_info_t* info);
|
||||||
|
|
||||||
|
//! Generate verthash data file and save it to specified location.
|
||||||
|
int verthash_generate_data_file(const char* output_file_name);
|
||||||
|
|
||||||
|
void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||||
|
const void *input, void *output );
|
||||||
|
|
||||||
|
void verthash_sha3_512_prehash_72( const void *input );
|
||||||
|
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );
|
||||||
|
|
||||||
|
#endif // !Verthash_INCLUDE_ONCE
|
||||||
|
|
||||||
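
A minimal lifecycle sketch based on the declarations above (the data-file name is a placeholder, and the 80-byte input / 32-byte output sizes simply follow VH_HEADER_SIZE and VH_HASH_OUT_SIZE):

    #include "Verthash.h"

    // Hypothetical example: load the data file, hash one block header, clean up.
    static int example_verthash_hash( const uint8_t header[VH_HEADER_SIZE],
                                      uint8_t hash[VH_HASH_OUT_SIZE] )
    {
        verthash_info_t vh;
        if ( verthash_info_init( &vh, "verthash.dat" ) != 0 )  // 0 == success
            return -1;
        verthash_hash( vh.data, vh.dataSize, header, hash );
        verthash_info_free( &vh );
        return 0;
    }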
181  algo/verthash/fopen_utf8.c  Normal file
@@ -0,0 +1,181 @@
#ifndef H_FOPEN_UTF8
#define H_FOPEN_UTF8

#include "fopen_utf8.h"
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

int utf8_char_size(const uint8_t *c)
{
    const uint8_t m0x = 0x80, c0x = 0x00,
          m10x = 0xC0, c10x = 0x80,
          m110x = 0xE0, c110x = 0xC0,
          m1110x = 0xF0, c1110x = 0xE0,
          m11110x = 0xF8, c11110x = 0xF0;

    if ((c[0] & m0x) == c0x)
        return 1;

    if ((c[0] & m110x) == c110x)
        if ((c[1] & m10x) == c10x)
            return 2;

    if ((c[0] & m1110x) == c1110x)
        if ((c[1] & m10x) == c10x)
            if ((c[2] & m10x) == c10x)
                return 3;

    if ((c[0] & m11110x) == c11110x)
        if ((c[1] & m10x) == c10x)
            if ((c[2] & m10x) == c10x)
                if ((c[3] & m10x) == c10x)
                    return 4;

    if ((c[0] & m10x) == c10x)   // not a first UTF-8 byte
        return 0;

    return -1;   // if c[0] is a first byte but the other bytes don't match
}

uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index)
{
    uint32_t v;
    int size;
    const uint8_t m6 = 63, m5 = 31, m4 = 15, m3 = 7;

    if (c==NULL)
        return 0;

    size = utf8_char_size(c);

    if (size > 0 && index)
        *index += size-1;

    switch (size)
    {
        case 1:
            v = c[0];
            break;
        case 2:
            v = c[0] & m5;
            v = v << 6 | (c[1] & m6);
            break;
        case 3:
            v = c[0] & m4;
            v = v << 6 | (c[1] & m6);
            v = v << 6 | (c[2] & m6);
            break;
        case 4:
            v = c[0] & m3;
            v = v << 6 | (c[1] & m6);
            v = v << 6 | (c[2] & m6);
            v = v << 6 | (c[3] & m6);
            break;
        case 0:    // not a first UTF-8 byte
        case -1:   // corrupt UTF-8 letter
        default:
            v = -1;
            break;
    }

    return v;
}

int codepoint_utf16_size(uint32_t c)
{
    if (c < 0x10000) return 1;
    if (c < 0x110000) return 2;

    return 0;
}

uint16_t *sprint_utf16(uint16_t *str, uint32_t c)   // str must be able to hold 1 to 3 entries and will be null-terminated by this function
{
    int c_size;

    if (str==NULL)
        return NULL;

    c_size = codepoint_utf16_size(c);

    switch (c_size)
    {
        case 1:
            str[0] = c;
            if (c > 0)
                str[1] = '\0';
            break;

        case 2:
            c -= 0x10000;
            str[0] = 0xD800 + (c >> 10);
            str[1] = 0xDC00 + (c & 0x3FF);
            str[2] = '\0';
            break;

        default:
            str[0] = '\0';
    }

    return str;
}

size_t strlen_utf8_to_utf16(const uint8_t *str)
{
    size_t i, count;
    uint32_t c;

    for (i=0, count=0; ; i++)
    {
        if (str[i]==0)
            return count;

        c = utf8_to_unicode32(&str[i], &i);
        count += codepoint_utf16_size(c);
    }
}

uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16)
{
    size_t i, j;
    uint32_t c;

    if (utf8==NULL)
        return NULL;

    if (utf16==NULL)
        utf16 = (uint16_t *) calloc(strlen_utf8_to_utf16(utf8) + 1, sizeof(uint16_t));

    for (i=0, j=0, c=1; c; i++)
    {
        c = utf8_to_unicode32(&utf8[i], &i);
        sprint_utf16(&utf16[j], c);
        j += codepoint_utf16_size(c);
    }

    return utf16;
}

FILE *fopen_utf8(const char *path, const char *mode)
{
#ifdef _WIN32
    wchar_t *wpath, wmode[8];
    FILE *file;

    if (utf8_to_utf16((const uint8_t *) mode, (uint16_t *) wmode)==NULL)
        return NULL;

    wpath = (wchar_t *) utf8_to_utf16((const uint8_t *) path, NULL);
    if (wpath==NULL)
        return NULL;

    file = _wfopen(wpath, wmode);
    free(wpath);
    return file;
#else
    return fopen(path, mode);
#endif
}
#endif
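
A small usage sketch for the portable open call defined above: the path is passed as UTF-8 on both Windows and Linux, so callers never deal with wide-char conversion themselves (the helper name and its path handling are illustrative only):

    #include "fopen_utf8.h"

    // Hypothetical helper: return the size of a file whose name is UTF-8 encoded.
    static long file_size_utf8( const char *utf8_path )
    {
        FILE *f = fopen_utf8( utf8_path, "rb" );
        long size;
        if ( !f ) return -1;
        fseek( f, 0, SEEK_END );
        size = ftell( f );
        fclose( f );
        return size;
    }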
25  algo/verthash/fopen_utf8.h  Normal file
@@ -0,0 +1,25 @@
#ifndef H_FOPEN_UTF8
#define H_FOPEN_UTF8
#ifdef __cplusplus
extern "C" {
#endif

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

int utf8_char_size(const uint8_t *c);
uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index);
int codepoint_utf16_size(uint32_t c);
uint16_t *sprint_utf16(uint16_t *str, uint32_t c);
size_t strlen_utf8_to_utf16(const uint8_t *str);
uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16);

FILE *fopen_utf8(const char *path, const char *mode);

#ifdef __cplusplus
}
#endif
#endif
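
One detail worth noting from the declarations above: when utf8_to_utf16() is given a NULL destination it allocates the UTF-16 buffer itself (with calloc), so the caller must free it, exactly as fopen_utf8() does with the converted path. A minimal sketch, with the string purely illustrative:

    #include "fopen_utf8.h"

    static void convert_and_free_example( const char *utf8_name )
    {
        uint16_t *wide = utf8_to_utf16( (const uint8_t *) utf8_name, NULL );
        if ( wide )
        {
            // ... use the UTF-16 string, e.g. pass it to a wide-char Windows API ...
            free( wide );
        }
    }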
301  algo/verthash/tiny_sha3/sha3-4way.c  Normal file
@@ -0,0 +1,301 @@
#if defined(__AVX2__)

// sha3-4way.c
// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
// vectorization by JayDDee 2021-03-27
//
// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
// Revised 03-Sep-15 for portability + OpenSSL - style API

#include "sha3-4way.h"

// constants
static const uint64_t keccakf_rndc[24] = {
    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
    0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

void sha3_4way_keccakf( __m256i st[25] )
{
    int i, j, r;
    __m256i t, bc[5];

    for ( r = 0; r < KECCAKF_ROUNDS; r++ )
    {
        // Theta
        bc[0] = _mm256_xor_si256( st[0],
                        mm256_xor4( st[5], st[10], st[15], st[20] ) );
        bc[1] = _mm256_xor_si256( st[1],
                        mm256_xor4( st[6], st[11], st[16], st[21] ) );
        bc[2] = _mm256_xor_si256( st[2],
                        mm256_xor4( st[7], st[12], st[17], st[22] ) );
        bc[3] = _mm256_xor_si256( st[3],
                        mm256_xor4( st[8], st[13], st[18], st[23] ) );
        bc[4] = _mm256_xor_si256( st[4],
                        mm256_xor4( st[9], st[14], st[19], st[24] ) );

        for ( i = 0; i < 5; i++ )
        {
            t = _mm256_xor_si256( bc[ (i+4) % 5 ],
                                  mm256_rol_64( bc[ (i+1) % 5 ], 1 ) );
            st[ i    ] = _mm256_xor_si256( st[ i    ], t );
            st[ i+5  ] = _mm256_xor_si256( st[ i+ 5 ], t );
            st[ i+10 ] = _mm256_xor_si256( st[ i+10 ], t );
            st[ i+15 ] = _mm256_xor_si256( st[ i+15 ], t );
            st[ i+20 ] = _mm256_xor_si256( st[ i+20 ], t );
        }

        // Rho Pi
#define RHO_PI( i, c ) \
   bc[0] = st[ i ]; \
   st[ i ] = mm256_rol_64( t, c ); \
   t = bc[0]

        t = st[1];

        RHO_PI( 10,  1 );
        RHO_PI(  7,  3 );
        RHO_PI( 11,  6 );
        RHO_PI( 17, 10 );
        RHO_PI( 18, 15 );
        RHO_PI(  3, 21 );
        RHO_PI(  5, 28 );
        RHO_PI( 16, 36 );
        RHO_PI(  8, 45 );
        RHO_PI( 21, 55 );
        RHO_PI( 24,  2 );
        RHO_PI(  4, 14 );
        RHO_PI( 15, 27 );
        RHO_PI( 23, 41 );
        RHO_PI( 19, 56 );
        RHO_PI( 13,  8 );
        RHO_PI( 12, 25 );
        RHO_PI(  2, 43 );
        RHO_PI( 20, 62 );
        RHO_PI( 14, 18 );
        RHO_PI( 22, 39 );
        RHO_PI(  9, 61 );
        RHO_PI(  6, 20 );
        RHO_PI(  1, 44 );

#undef RHO_PI

        // Chi
        for ( j = 0; j < 25; j += 5 )
        {
            memcpy( bc, &st[ j ], 5*32 );
            st[ j   ] = _mm256_xor_si256( st[ j   ],
                                  _mm256_andnot_si256( bc[1], bc[2] ) );
            st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
                                  _mm256_andnot_si256( bc[2], bc[3] ) );
            st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
                                  _mm256_andnot_si256( bc[3], bc[4] ) );
            st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
                                  _mm256_andnot_si256( bc[4], bc[0] ) );
            st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
                                  _mm256_andnot_si256( bc[0], bc[1] ) );
        }

        // Iota
        st[0] = _mm256_xor_si256( st[0],
                                  _mm256_set1_epi64x( keccakf_rndc[ r ] ) );
    }
}

int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen )
{
    for ( int i = 0; i < 25; i++ ) c->st[ i ] = m256_zero;
    c->mdlen = mdlen;
    c->rsiz = 200 - 2 * mdlen;
    c->pt = 0;
    return 1;
}

int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
{
    size_t i;
    int j = c->pt;
    const int rsiz = c->rsiz / 8;
    const int l = len / 8;

    for ( i = 0; i < l; i++ )
    {
        c->st[ j ] = _mm256_xor_si256( c->st[ j ],
                                       ( (const __m256i*)data )[i] );
        j++;
        if ( j >= rsiz )
        {
            sha3_4way_keccakf( c->st );
            j = 0;
        }
    }
    c->pt = j;

    return 1;
}

int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
{
    c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
                                       m256_const1_64( 6 ) );
    c->st[ c->rsiz / 8 - 1 ] =
            _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
                              m256_const1_64( 0x8000000000000000 ) );
    sha3_4way_keccakf( c->st );
    memcpy( md, c->st, c->mdlen * 4 );
    return 1;
}

void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen )
{
    sha3_4way_ctx_t ctx;
    sha3_4way_init( &ctx, mdlen);
    sha3_4way_update( &ctx, in, inlen );
    sha3_4way_final( md, &ctx );
    return md;
}

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

void sha3_8way_keccakf( __m512i st[25] )
{
    int i, j, r;
    __m512i t, bc[5];

    // actual iteration
    for ( r = 0; r < KECCAKF_ROUNDS; r++ )
    {

        // Theta
        for ( i = 0; i < 5; i++ )
            bc[i] = _mm512_xor_si512( st[i],
                 mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ], st[i+20 ] ) );

        for ( i = 0; i < 5; i++ )
        {
            t = _mm512_xor_si512( bc[(i + 4) % 5],
                                  _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) );
            for ( j = 0; j < 25; j += 5 )
                st[j + i] = _mm512_xor_si512( st[j + i], t );
        }

        // Rho Pi
#define RHO_PI( i, c ) \
   bc[0] = st[ i ]; \
   st[ i ] = _mm512_rol_epi64( t, c ); \
   t = bc[0]

        t = st[1];

        RHO_PI( 10,  1 );
        RHO_PI(  7,  3 );
        RHO_PI( 11,  6 );
        RHO_PI( 17, 10 );
        RHO_PI( 18, 15 );
        RHO_PI(  3, 21 );
        RHO_PI(  5, 28 );
        RHO_PI( 16, 36 );
        RHO_PI(  8, 45 );
        RHO_PI( 21, 55 );
        RHO_PI( 24,  2 );
        RHO_PI(  4, 14 );
        RHO_PI( 15, 27 );
        RHO_PI( 23, 41 );
        RHO_PI( 19, 56 );
        RHO_PI( 13,  8 );
        RHO_PI( 12, 25 );
        RHO_PI(  2, 43 );
        RHO_PI( 20, 62 );
        RHO_PI( 14, 18 );
        RHO_PI( 22, 39 );
        RHO_PI(  9, 61 );
        RHO_PI(  6, 20 );
        RHO_PI(  1, 44 );

#undef RHO_PI

        // Chi
        for ( j = 0; j < 25; j += 5 )
        {
            for ( i = 0; i < 5; i++ )
                bc[i] = st[j + i];
            for ( i = 0; i < 5; i++ )
                st[ j+i ] = _mm512_xor_si512( st[ j+i ], _mm512_andnot_si512(
                                          bc[ (i+1) % 5 ], bc[ (i+2) % 5 ] ) );
        }

        // Iota
        st[0] = _mm512_xor_si512( st[0], _mm512_set1_epi64( keccakf_rndc[r] ) );
    }
}

// Initialize the context for SHA3

int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen )
{
    for ( int i = 0; i < 25; i++ ) c->st[ i ] = m512_zero;
    c->mdlen = mdlen;
    c->rsiz = 200 - 2 * mdlen;
    c->pt = 0;
    return 1;
}

// update state with more data

int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len )
{
    size_t i;
    int j = c->pt;
    const int rsiz = c->rsiz / 8;
    const int l = len / 8;

    for ( i = 0; i < l; i++ )
    {
        c->st[ j ] = _mm512_xor_si512( c->st[ j ],
                                       ( (const __m512i*)data )[i] );
        j++;
        if ( j >= rsiz )
        {
            sha3_8way_keccakf( c->st );
            j = 0;
        }
    }
    c->pt = j;

    return 1;
}

// finalize and output a hash

int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
{
    c->st[ c->pt ] =
            _mm512_xor_si512( c->st[ c->pt ],
                              m512_const1_64( 6 ) );
    c->st[ c->rsiz / 8 - 1 ] =
            _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
                              m512_const1_64( 0x8000000000000000 ) );
    sha3_8way_keccakf( c->st );
    memcpy( md, c->st, c->mdlen * 8 );
    return 1;
}

// compute a SHA-3 hash (md) of given byte length from "in"

void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen )
{
    sha3_8way_ctx_t sha3;
    sha3_8way_init( &sha3, mdlen);
    sha3_8way_update( &sha3, in, inlen );
    sha3_8way_final( md, &sha3 );
    return md;
}

#endif // AVX512
#endif // AVX2
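
The 4-way update above absorbs one __m256i per 64-bit message word, i.e. the same word index from all four lanes at once, so callers must interleave the four input streams 8 bytes at a time. A scalar sketch of that layout follows (a real miner would normally use optimized interleave helpers; this loop is only an illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <immintrin.h>

    // Interleave four 64-bit-word streams lane-wise: dst[w] holds word w of
    // lanes 0..3, with lane 0 in the lowest 64 bits of each __m256i.
    static void interleave_4x64( __m256i *dst, const uint64_t *l0,
                                 const uint64_t *l1, const uint64_t *l2,
                                 const uint64_t *l3, size_t words )
    {
        for ( size_t w = 0; w < words; w++ )
            dst[w] = _mm256_set_epi64x( (long long)l3[w], (long long)l2[w],
                                        (long long)l1[w], (long long)l0[w] );
    }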
67  algo/verthash/tiny_sha3/sha3-4way.h  Normal file
@@ -0,0 +1,67 @@
// sha3.h
// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
// 2021-03-27 JayDDee
//
#ifndef SHA3_4WAY_H
#define SHA3_4WAY_H

#include <stddef.h>
#include <stdint.h>
#include "simd-utils.h"

#if defined(__cplusplus)
extern "C" {
#endif

#ifndef KECCAKF_ROUNDS
#define KECCAKF_ROUNDS 24
#endif

#if defined(__AVX2__)

typedef struct
{
    __m256i st[25];        // 64-bit words * 4 lanes
    int pt, rsiz, mdlen;   // these don't overflow
} sha3_4way_ctx_t __attribute__ ((aligned (64)));

// Compression function.
void sha3_4way_keccakf( __m256i st[25] );

// OpenSSL-like interface
int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen );   // mdlen = hash output in bytes
int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len );
int sha3_4way_final( void *md, sha3_4way_ctx_t *c );   // digest goes to md

// compute a sha3 hash (md) of given byte length from "in"
void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen );


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// state context
typedef struct
{
    __m512i st[25];        // 64-bit words * 8 lanes
    int pt, rsiz, mdlen;   // these don't overflow
} sha3_8way_ctx_t __attribute__ ((aligned (64)));

// Compression function.
void sha3_8way_keccakf( __m512i st[25] );

// OpenSSL-like interface
int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen );   // mdlen = hash output in bytes
int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len );
int sha3_8way_final( void *md, sha3_8way_ctx_t *c );   // digest goes to md

// compute a sha3 hash (md) of given byte length from "in"
void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen );

#endif // AVX512
#endif // AVX2

#if defined(__cplusplus)
}
#endif

#endif
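
A one-shot usage sketch for the 4-way wrapper declared above. Here mdlen = 64 selects a 64-byte (SHA3-512 size) digest, which appears to match the sha3_512 helpers in Verthash.h; the 72-byte per-lane input length is just an example, and both buffers are assumed to already be 64-bit interleaved as described for sha3_4way_update():

    #include "sha3-4way.h"

    // Hash four interleaved 72-byte messages in one call; the digests come
    // back interleaved in the same lane order.
    void example_sha3_4way( const __m256i in[9], __m256i md[8] )
    {
        sha3_4way( in, 72, md, 64 );   // 9 words in, 8 words (64 bytes) out per lane
    }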
Some files were not shown because too many files have changed in this diff.