mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
53 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
45ecd0de14 | ||
|
|
4fa8fcea8b | ||
|
|
c85fb3842b | ||
|
|
cdd587537e | ||
|
|
51a1d91abd | ||
|
|
13563e2598 | ||
|
|
9571f85d53 | ||
|
|
0e69756634 | ||
|
|
9653bca1e2 | ||
|
|
1c0719e8a4 | ||
|
|
8b4b4dc613 | ||
|
|
e76feaced8 | ||
|
|
5e088d00d0 | ||
|
|
972d4d70db | ||
|
|
e96a6bd699 | ||
|
|
fb9163185a | ||
|
|
6e8b8ed34f | ||
|
|
c0aadbcc99 | ||
|
|
3da149418a | ||
|
|
720610cce5 | ||
|
|
cedcf4d070 | ||
|
|
81b50c3c71 | ||
|
|
0e1e88f53e | ||
|
|
45c77a5c81 | ||
|
|
dbce7e0721 | ||
|
|
6d66051de6 | ||
|
|
b93be8816a | ||
|
|
19b0ac6d5c | ||
|
|
3da2b958cf | ||
|
|
dc2f8d81d3 | ||
|
|
fc97ef174a | ||
|
|
13523a12f9 | ||
|
|
1b76cee239 | ||
|
|
0681ca996d | ||
|
|
88f81fda0b | ||
|
|
103e6ad36c | ||
|
|
1a7a573675 | ||
|
|
70089d1224 | ||
|
|
3572cb53c4 | ||
|
|
241bc26767 | ||
|
|
c65b0ff7a6 | ||
|
|
a17ff6f189 | ||
|
|
73430b13b1 | ||
|
|
40039386a0 | ||
|
|
91ec6f1771 | ||
|
|
a52c5eccf7 | ||
|
|
86b889e1b0 | ||
|
|
72330eb5a7 | ||
|
|
789c8b70bc | ||
|
|
01550d94a2 | ||
|
|
a042fb7612 | ||
|
|
9d49e0be7a | ||
|
|
a51f59086b |
3
AUTHORS
3
AUTHORS
@@ -33,3 +33,6 @@ Jay D Dee
|
||||
xcouiz@gmail.com
|
||||
|
||||
Cryply
|
||||
|
||||
Colin Percival
|
||||
Alexander Peslyak
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
|
||||
|
||||
Requirements:
|
||||
1. Requirements:
|
||||
---------------
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux operating system. Apple is not supported.
|
||||
|
||||
Building on linux prerequisites:
|
||||
2. Building on linux prerequisites:
|
||||
-----------------------------------
|
||||
|
||||
It is assumed users know how to install packages on their system and
|
||||
be able to compile standard source packages. This is basic Linux and
|
||||
@@ -20,49 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
||||
|
||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||
are some of the ones that may not be in the default install and need to
|
||||
be installed manually. There may be others, read the error messages they
|
||||
will give a clue as to the missing package.
|
||||
be installed manually. There may be others, read the compiler error messages,
|
||||
they will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu:
|
||||
distributions such as Ubuntu. Fedora and other distributions may have similar
|
||||
but different package names.
|
||||
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
|
||||
|
||||
build-essential (Development Tools package group on Fedora)
|
||||
automake
|
||||
libjansson-dev
|
||||
libgmp-dev
|
||||
libcurl4-openssl-dev
|
||||
libssl-dev
|
||||
lib-thread
|
||||
zlib1g-dev
|
||||
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following, depending on the
|
||||
compiler version, to CFLAGS:
|
||||
"-march=native" or "-march=znver1" or "-msha".
|
||||
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||
support depending on your CPU and compiler version:
|
||||
|
||||
"-march=native" is always the best choice
|
||||
|
||||
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||
|
||||
"-msha" Add SHA to other tuning options
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
Static builds should only considered in a homogeneous HW and SW environment.
|
||||
Local builds will always have the best performance and compatibility.
|
||||
|
||||
Extract cpuminer source.
|
||||
3. Download cpuminer-opt
|
||||
------------------------
|
||||
|
||||
tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
cd cpuminer-opt-x.y.z
|
||||
Download the source code for the latest realease from the official repository.
|
||||
|
||||
Run ./build.sh to build on Linux or execute the following commands.
|
||||
https://github.com/JayDDee/cpuminer-opt/releases
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
Extract the source code.
|
||||
|
||||
Start mining.
|
||||
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||
|
||||
|
||||
Alternatively it can be cloned from git.
|
||||
|
||||
$ git clone https://github.com/JayDDee/cpuminer-opt.git
|
||||
|
||||
4. Build cpuminer-opt
|
||||
---------------------
|
||||
|
||||
It is recomended to Build with default options, this will usuallly
|
||||
produce the best results.
|
||||
|
||||
$ ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
or
|
||||
|
||||
$ ./autogen.sh
|
||||
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
$ make -j n
|
||||
|
||||
n is the number of threads.
|
||||
|
||||
5. Start mining.
|
||||
----------------
|
||||
|
||||
$ ./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
./cpuminer -a algo -o url -u username -p password
|
||||
|
||||
Windows
|
||||
-------
|
||||
|
||||
See also INSTAL_WINDOWS
|
||||
|
||||
The following procedure is obsolete and uses an old compiler.
|
||||
|
||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||
with a more recent compiler than the following Windows hosted procedure.
|
||||
|
||||
@@ -22,14 +22,13 @@ Step by step...
|
||||
|
||||
Refer to Linux compile instructions and install required packages.
|
||||
|
||||
Additionally, install mingw-64.
|
||||
Additionally, install mingw-w64.
|
||||
|
||||
sudo apt-get install mingw-w64
|
||||
|
||||
|
||||
2. Create a local library directory for packages to be compiled in the next
|
||||
step. Recommended location is $HOME/usr/lib/
|
||||
|
||||
step. Suggested location is $HOME/usr/lib/
|
||||
|
||||
3. Download and build other packages for mingw that don't have a mingw64
|
||||
version available in the repositories.
|
||||
|
||||
75
Makefile.am
75
Makefile.am
@@ -18,19 +18,9 @@ dist_man_MANS = cpuminer.1
|
||||
cpuminer_SOURCES = \
|
||||
cpu-miner.c \
|
||||
util.c \
|
||||
uint256.cpp \
|
||||
api.c \
|
||||
sysinfos.c \
|
||||
algo-gate-api.c\
|
||||
crypto/oaes_lib.c \
|
||||
crypto/c_keccak.c \
|
||||
crypto/c_groestl.c \
|
||||
crypto/c_blake256.c \
|
||||
crypto/c_jh.c \
|
||||
crypto/c_skein.c \
|
||||
crypto/hash.c \
|
||||
crypto/aesb.c \
|
||||
crypto/magimath.cpp \
|
||||
algo/argon2/argon2a/argon2a.c \
|
||||
algo/argon2/argon2a/ar2/argon2.c \
|
||||
algo/argon2/argon2a/ar2/opt.c \
|
||||
@@ -51,12 +41,15 @@ cpuminer_SOURCES = \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
algo/blake/sph_blake2b.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/sph-blake2s.c \
|
||||
algo/blake/blake2s-hash-4way.c \
|
||||
algo/blake/blake2s.c \
|
||||
algo/blake/blake2s-gate.c \
|
||||
algo/blake/blake2s-4way.c \
|
||||
algo/blake/blake2b-hash-4way.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/blake2b-gate.c \
|
||||
algo/blake/blake2b-4way.c \
|
||||
algo/blake/blakecoin-gate.c \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
@@ -74,31 +67,29 @@ cpuminer_SOURCES = \
|
||||
algo/bmw/bmw512-gate.c \
|
||||
algo/bmw/bmw512.c \
|
||||
algo/bmw/bmw512-4way.c \
|
||||
algo/cryptonight/cryptolight.c \
|
||||
algo/cryptonight/cryptonight-common.c\
|
||||
algo/cryptonight/cryptonight-aesni.c\
|
||||
algo/cryptonight/cryptonight.c\
|
||||
algo/cubehash/sph_cubehash.c \
|
||||
algo/cubehash/cubehash_sse2.c\
|
||||
algo/cubehash/cube-hash-2way.c \
|
||||
algo/echo/sph_echo.c \
|
||||
algo/echo/echo-hash-4way.c \
|
||||
algo/echo/aes_ni/hash.c\
|
||||
algo/gost/sph_gost.c \
|
||||
algo/groestl/groestl-gate.c \
|
||||
algo/groestl/groestl512-hash-4way.c \
|
||||
algo/groestl/groestl256-hash-4way.c \
|
||||
algo/groestl/sph_groestl.c \
|
||||
algo/groestl/groestl.c \
|
||||
algo/groestl/groestl-4way.c \
|
||||
algo/groestl/myrgr-gate.c \
|
||||
algo/groestl/myrgr-4way.c \
|
||||
algo/groestl/myr-groestl.c \
|
||||
algo/groestl/aes_ni/hash-groestl.c \
|
||||
algo/groestl/aes_ni/hash-groestl256.c \
|
||||
algo/fugue/sph_fugue.c \
|
||||
algo/fugue/fugue-aesni.c \
|
||||
algo/hamsi/sph_hamsi.c \
|
||||
algo/hamsi/hamsi-hash-4way.c \
|
||||
algo/haval/haval.c \
|
||||
algo/haval/haval-hash-4way.c \
|
||||
algo/heavy/sph_hefty1.c \
|
||||
algo/heavy/heavy.c \
|
||||
algo/heavy/bastion.c \
|
||||
algo/hodl/aes.c \
|
||||
algo/hodl/hodl-gate.c \
|
||||
algo/hodl/hodl-wolf.c \
|
||||
@@ -114,13 +105,15 @@ cpuminer_SOURCES = \
|
||||
algo/keccak/keccak-hash-4way.c \
|
||||
algo/keccak/keccak-4way.c\
|
||||
algo/keccak/keccak-gate.c \
|
||||
algo/keccak/sse2/keccak.c \
|
||||
algo/luffa/sph_luffa.c \
|
||||
algo/luffa/luffa.c \
|
||||
algo/keccak/sha3d-4way.c \
|
||||
algo/keccak/sha3d.c \
|
||||
algo/lanehash/lane.c \
|
||||
algo/luffa/luffa_for_sse2.c \
|
||||
algo/luffa/luffa-hash-2way.c \
|
||||
algo/lyra2/lyra2.c \
|
||||
algo/lyra2/sponge.c \
|
||||
algo/lyra2/sponge-2way.c \
|
||||
algo/lyra2/lyra2-hash-2way.c \
|
||||
algo/lyra2/lyra2-gate.c \
|
||||
algo/lyra2/lyra2rev2.c \
|
||||
algo/lyra2/lyra2rev2-4way.c \
|
||||
@@ -136,13 +129,14 @@ cpuminer_SOURCES = \
|
||||
algo/lyra2/allium.c \
|
||||
algo/lyra2/phi2-4way.c \
|
||||
algo/lyra2/phi2.c \
|
||||
algo/m7m.c \
|
||||
algo//m7m/m7m.c \
|
||||
algo/m7m/magimath.cpp \
|
||||
algo/nist5/nist5-gate.c \
|
||||
algo/nist5/nist5-4way.c \
|
||||
algo/nist5/nist5.c \
|
||||
algo/nist5/zr5.c \
|
||||
algo/panama/panama-hash-4way.c \
|
||||
algo/panama/sph_panama.c \
|
||||
algo/radiogatun/sph_radiogatun.c \
|
||||
algo/quark/quark-gate.c \
|
||||
algo/quark/quark.c \
|
||||
algo/quark/quark-4way.c \
|
||||
@@ -165,12 +159,12 @@ cpuminer_SOURCES = \
|
||||
algo/ripemd/lbry-4way.c \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/scrypt/pluck.c \
|
||||
algo/scryptjane/scrypt-jane.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha2-hash-4way.c \
|
||||
algo/sha/sha256_hash_11way.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
@@ -182,8 +176,8 @@ cpuminer_SOURCES = \
|
||||
algo/shavite/sph_shavite.c \
|
||||
algo/shavite/sph-shavite-aesni.c \
|
||||
algo/shavite/shavite-hash-2way.c \
|
||||
algo/shavite/shavite-hash-4way.c \
|
||||
algo/shavite/shavite.c \
|
||||
algo/simd/sph_simd.c \
|
||||
algo/simd/nist.c \
|
||||
algo/simd/vector.c \
|
||||
algo/simd/simd-hash-2way.c \
|
||||
@@ -194,9 +188,9 @@ cpuminer_SOURCES = \
|
||||
algo/skein/skein-gate.c \
|
||||
algo/skein/skein2.c \
|
||||
algo/skein/skein2-4way.c \
|
||||
algo/skein/skein2-gate.c \
|
||||
algo/sm3/sm3.c \
|
||||
algo/sm3/sm3-hash-4way.c \
|
||||
algo/swifftx/swifftx.c \
|
||||
algo/tiger/sph_tiger.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
@@ -221,7 +215,6 @@ cpuminer_SOURCES = \
|
||||
algo/x11/timetravel10-gate.c \
|
||||
algo/x11/timetravel10.c \
|
||||
algo/x11/timetravel10-4way.c \
|
||||
algo/x11/fresh.c \
|
||||
algo/x11/x11evo.c \
|
||||
algo/x11/x11evo-4way.c \
|
||||
algo/x11/x11evo-gate.c \
|
||||
@@ -240,7 +233,6 @@ cpuminer_SOURCES = \
|
||||
algo/x13/skunk-gate.c \
|
||||
algo/x13/skunk-4way.c \
|
||||
algo/x13/skunk.c \
|
||||
algo/x13/drop.c \
|
||||
algo/x13/x13bcd-4way.c \
|
||||
algo/x13/x13bcd.c \
|
||||
algo/x14/x14-gate.c \
|
||||
@@ -259,8 +251,14 @@ cpuminer_SOURCES = \
|
||||
algo/x16/x16r-gate.c \
|
||||
algo/x16/x16r.c \
|
||||
algo/x16/x16r-4way.c \
|
||||
algo/x16/x16rv2.c \
|
||||
algo/x16/x16rv2-4way.c \
|
||||
algo/x16/x16rt.c \
|
||||
algo/x16/x16rt-4way.c \
|
||||
algo/x16/hex.c \
|
||||
algo/x16/x21s-4way.c \
|
||||
algo/x16/x21s.c \
|
||||
algo/x16/minotaur.c \
|
||||
algo/x17/x17-gate.c \
|
||||
algo/x17/x17.c \
|
||||
algo/x17/x17-4way.c \
|
||||
@@ -270,12 +268,17 @@ cpuminer_SOURCES = \
|
||||
algo/x17/sonoa-gate.c \
|
||||
algo/x17/sonoa-4way.c \
|
||||
algo/x17/sonoa.c \
|
||||
algo/x20/x20r.c \
|
||||
algo/x22/x22i-4way.c \
|
||||
algo/x22/x22i.c \
|
||||
algo/x22/x22i-gate.c \
|
||||
algo/x22/x25x.c \
|
||||
algo/x22/x25x-4way.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/sha256_Y.c \
|
||||
algo/yescrypt/yescrypt-best.c \
|
||||
algo/yespower/yespower.c \
|
||||
algo/yespower/sha256_p.c \
|
||||
algo/yespower/yespower-gate.c \
|
||||
algo/yespower/yespower-blake2b.c \
|
||||
algo/yespower/crypto/blake2b-yp.c \
|
||||
algo/yespower/yescrypt-r8g.c \
|
||||
algo/yespower/yespower-opt.c
|
||||
|
||||
disable_flags =
|
||||
|
||||
100
README.md
100
README.md
@@ -12,10 +12,24 @@ a false positive, they are flagged simply because they are cryptocurrency
|
||||
miners. The source code is open for anyone to inspect. If you don't trust
|
||||
the software, don't use it.
|
||||
|
||||
|
||||
New thread:
|
||||
|
||||
https://bitcointalk.org/index.php?topic=5226770.msg53865575#msg53865575
|
||||
|
||||
Old thread:
|
||||
|
||||
https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
mailto://jayddee246@gmail.com
|
||||
|
||||
This note is to confirm that bitcointalk users JayDDee and joblo are the
|
||||
same person.
|
||||
|
||||
I created a new BCT user JayDDee to match my github user id.
|
||||
The old thread has been locked but still contains useful information for
|
||||
reading.
|
||||
|
||||
See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
|
||||
for compile instructions.
|
||||
|
||||
@@ -23,25 +37,25 @@ Requirements
|
||||
------------
|
||||
|
||||
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
|
||||
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
|
||||
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
|
||||
and newer and AMD equivalents. Further optimizations are available on some
|
||||
algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
|
||||
Intel Core2 and newer and AMD equivalents. Further optimizations are available
|
||||
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
|
||||
|
||||
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
|
||||
performance.
|
||||
|
||||
ARM CPUs are not supported.
|
||||
ARM and Aarch64 CPUs are not supported.
|
||||
|
||||
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
|
||||
Centos, are known to work and have all dependencies in their repositories.
|
||||
Others may work but may require more effort. Older versions such as Centos 6
|
||||
don't work due to missing features.
|
||||
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
|
||||
including Mint and Centos, are known to work and have all dependencies
|
||||
in their repositories. Others may work but may require more effort. Older
|
||||
versions such as Centos 6 don't work due to missing features.
|
||||
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
|
||||
|
||||
MacOS, OSx and Android are not supported.
|
||||
|
||||
3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
|
||||
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
||||
RPC getwork using http:// or https://.
|
||||
GBT is YMMV.
|
||||
|
||||
Supported Algorithms
|
||||
--------------------
|
||||
@@ -53,20 +67,18 @@ Supported Algorithms
|
||||
argon2d500 argon2d-dyn, Dynamic (DYN)
|
||||
argon2d4096 argon2d-uis, Unitus, (UIS)
|
||||
axiom Shabal-256 MemoHash
|
||||
bastion
|
||||
blake Blake-256 (SFR)
|
||||
blakecoin blake256r8
|
||||
blake2b Blake2b 256
|
||||
blake2s Blake-2 S
|
||||
blakecoin blake256r8
|
||||
bmw BMW 256
|
||||
bmw512 BMW 512
|
||||
c11 Chaincoin
|
||||
decred
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
drop Dropcoin
|
||||
fresh Fresh
|
||||
groestl Groestl coin
|
||||
heavy Heavy
|
||||
hex x16r-hex
|
||||
hmq1725 Espers
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
@@ -81,22 +93,25 @@ Supported Algorithms
|
||||
lyra2z
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
minotaur Ringcoin (RNG)
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi, LUX coin (original algo)
|
||||
phi2 LUX coin (new algo)
|
||||
phi1612 phi
|
||||
phi2 Luxcoin (LUX)
|
||||
phi2-lux identical to phi2
|
||||
pluck Pluck:128 (Supcoin)
|
||||
polytimos Ninja
|
||||
power2b MicroBitcoin (MBC)
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
scryptjane:nf
|
||||
sha256d Double SHA-256
|
||||
sha256q Quad SHA-256, Pyrite (PYE)
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
sha3d Double keccak256 (BSHA3)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
@@ -118,23 +133,53 @@ Supported Algorithms
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r Ravencoin (RVN)
|
||||
x16r
|
||||
x16rv2 Ravencoin (RVN)
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt_veil Veil (VEIL)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
x17
|
||||
x21s
|
||||
x22i
|
||||
x25x
|
||||
xevan Bitsend (BSD)
|
||||
yescrypt Globalboost-Y (BSTY)
|
||||
yescryptr8 BitZeny (ZNY)
|
||||
yescryptr8g Koto (KOTO)
|
||||
yescryptr16 Eli
|
||||
yescryptr32 WAVI
|
||||
yespower Cryply
|
||||
yespowerr16 Yenten (YTN)
|
||||
yespower-b2b generic yespower + blake2b
|
||||
zr5 Ziftr
|
||||
|
||||
Many variations of scrypt based algos can be mine by specifying their
|
||||
parameters:
|
||||
|
||||
scryptn2: --algo scrypt --param-n 1048576
|
||||
|
||||
cpupower: --algo yespower --param-key "CPUpower: The number of CPU working or available for proof-of-work mining"
|
||||
|
||||
power2b: --algo yespower-b2b --param-n 2048 --param-r 32 --param-key "Now I am become Death, the destroyer of worlds"
|
||||
|
||||
sugarchain: --algo yespower --param-n 2048 -param-r 32 --param-key "Satoshi Nakamoto 31/Oct/2008 Proof-of-work is essentially one-CPU-one-vote"
|
||||
|
||||
yespoweriots: --algo yespower --param-n 2048 --param-key "Iots is committed to the development of IOT"
|
||||
|
||||
yespowerlitb: --algo yespower --param-n 2048 --param-r 32 --param-key "LITBpower: The number of LITB working or available for proof-of-work mini"
|
||||
|
||||
yespoweric: --algo yespower --param-n 2048 --param-r 32 --param-key "IsotopeC"
|
||||
|
||||
yespowerurx: --algo yespower --param-n 2048 --param-r 32 --param-key "UraniumX"
|
||||
|
||||
yespowerltncg: --algo yespower --param-n 2048 --param-r 32 --param-key "LTNCGYES"
|
||||
|
||||
Errata
|
||||
------
|
||||
|
||||
Old algorithms that are no longer used frequently will not have the latest
|
||||
optimizations.
|
||||
|
||||
Cryptonight and variants are no longer supported, use another miner.
|
||||
|
||||
Neoscrypt crashes on Windows, use legacy version.
|
||||
@@ -152,14 +197,17 @@ Benchmark testing does not work for x11evo.
|
||||
Bugs
|
||||
----
|
||||
|
||||
Users are encouraged to post their bug reports on the Bitcoin Talk
|
||||
forum at:
|
||||
Users are encouraged to post their bug reports using git issues or on the
|
||||
Bitcoin Talk forum or opening an issue in git:
|
||||
|
||||
https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
All problem reports must be accompanied by a proper definition.
|
||||
https://github.com/JayDDee/cpuminer-opt/issues
|
||||
|
||||
All problem reports must be accompanied by a proper problem definition.
|
||||
This should include how the problem occurred, the command line and
|
||||
output from the miner showing the startup and any errors.
|
||||
output from the miner showing the startup messages and any errors.
|
||||
A history is also useful, ie did it work before.
|
||||
|
||||
Donations
|
||||
---------
|
||||
@@ -167,10 +215,6 @@ Donations
|
||||
cpuminer-opt has no fees of any kind but donations are accepted.
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
|
||||
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
|
||||
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
|
||||
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
|
||||
|
||||
Happy mining!
|
||||
|
||||
|
||||
48
README.txt
48
README.txt
@@ -1,8 +1,12 @@
|
||||
This file is included in the Windows binary package. Compile instructions
|
||||
for Linux and Windows can be found in RELEASE_NOTES.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS command prompt.
|
||||
There is no GUI and no mouse support.
|
||||
This package is officially avalable only from:
|
||||
https://github.com/JayDDee/cpuminer-opt
|
||||
No other sources should be trusted.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS or Powershell
|
||||
prompt. There is no GUI and no mouse support.
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
@@ -15,7 +19,7 @@ the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Even the newest Pentium and Celeron CPUs are often missing
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing some
|
||||
features.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
@@ -23,13 +27,39 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Exe name Compile flags Arch name
|
||||
More information for Intel and AMD CPU architectures and their features
|
||||
can be found on Wikipedia.
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||
https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
|
||||
Exe file name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-marxh=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen.exe "-march=znver1" Zen1, Zen2
|
||||
cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(2)
|
||||
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake(3)
|
||||
|
||||
(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
|
||||
(2) Zen3 build uses Zen2+VAES as workaround until Zen3 compiler support is
|
||||
available. Zen2 CPUs should use Zen build.
|
||||
(3) Icelake is only available on some laptops. Mining with a laptop is not
|
||||
recommended.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
Downloading DLL files from alternative sources presents an inherent
|
||||
security risk if their source is unknown. All DLL files included have
|
||||
been copied from the Ubuntu-20.04 instalation or compiled by me from
|
||||
source code obtained from the author's official repository. The exact
|
||||
procedure is documented in the build instructions for Windows:
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
|
||||
575
RELEASE_NOTES
575
RELEASE_NOTES
@@ -1,21 +1,18 @@
|
||||
cpuminer-opt is a console program run from the command line using the
|
||||
keyboard, not the mouse.
|
||||
|
||||
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
|
||||
This feature requires recent SW including GCC version 5 or higher and
|
||||
openssl version 1.1 or higher. It may also require using "-march=znver1"
|
||||
compile flag.
|
||||
|
||||
cpuminer-opt is a console program, if you're using a mouse you're doing it
|
||||
wrong.
|
||||
See also README.md for list of supported algorithms,
|
||||
|
||||
Security warning
|
||||
----------------
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
miners. The source code is open for anyone to inspect. If you don't trust
|
||||
the software, don't use it.
|
||||
usually a false positive, they are flagged simply because they are
|
||||
cryptocurrency miners. However, some malware masquerading as a miner has
|
||||
been spread using the cover that miners are known to be subject to false
|
||||
positives ans users will dismiss the AV alert. Always be on alert.
|
||||
The source code of cpuminer-opt is open for anyone to inspect.
|
||||
If you don't trust the software don't download it.
|
||||
|
||||
The cryptographic hashing code has been taken from trusted sources but has been
|
||||
modified for speed at the expense of accepted security practices. This
|
||||
@@ -25,7 +22,7 @@ required.
|
||||
Compile Instructions
|
||||
--------------------
|
||||
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions
|
||||
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
|
||||
|
||||
Requirements
|
||||
------------
|
||||
@@ -33,11 +30,565 @@ Requirements
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
|
||||
64 bit Linux or Windows operating system. Apple and Android are not supported.
|
||||
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
|
||||
are not supported. FreeBSD YMMV.
|
||||
|
||||
Reporting bugs
|
||||
--------------
|
||||
|
||||
Bugs can be reported by sending am email to JayDDee246@gmail.com or opening
|
||||
an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
|
||||
|
||||
Please include the following information:
|
||||
|
||||
1. CPU model, operating system, cpuminer-opt version (must be latest),
|
||||
binary file for Windows, changes to default build procedure for Linux.
|
||||
|
||||
2. Exact command line (except user and pw) and intial output showing
|
||||
the above requested info.
|
||||
|
||||
3. Additional program output showing any error messages or other
|
||||
pertinent data.
|
||||
|
||||
4. A clear description of the problem including history, scope,
|
||||
persistence or intermittance, and reproduceability.
|
||||
|
||||
In simpler terms:
|
||||
|
||||
What is it doing?
|
||||
What should it be doing instead?
|
||||
Did it work in a previous release?
|
||||
Does it happen for all algos? All pools? All options? Solo?
|
||||
Does it happen all the time?
|
||||
If not what makes it happen or not happen?
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.15.2
|
||||
|
||||
Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
|
||||
allium.
|
||||
Zen3 build added to Windows binary package.
|
||||
|
||||
v3.15.1
|
||||
|
||||
Fix compile on AMD Zen3 CPUs with VAES.
|
||||
Force new work immediately after solving a block solo.
|
||||
|
||||
|
||||
v3.15.0
|
||||
|
||||
Fugue optimized with AES, improves many sha3 algos.
|
||||
Minotaur algo optimized for all architectures.
|
||||
Fixed neoscrypt BUG log.
|
||||
|
||||
v3.14.3
|
||||
|
||||
#265: more mutex changes to reduce blocking with high thread count.
|
||||
|
||||
#267: fixed hodl algo potential memory alignment issue,
|
||||
add warning when thread count is not valid for mining hodl algo.
|
||||
|
||||
v3.14.2
|
||||
|
||||
The second line of the Share Accepted log is no longer displayed,
|
||||
new Xnonce log is added and other small log tweaks.
|
||||
|
||||
#265: Cleanup use of mutex.
|
||||
|
||||
v3.14.1
|
||||
|
||||
GBT and getwork log changes:
|
||||
fixed missing TTF in New Block log,
|
||||
ntime no longer byte-swapped for display in New Work log,
|
||||
fixed zero effective hash rate in Periodic Report log,
|
||||
deleted "Current block is..." log.
|
||||
|
||||
Renamed stratum "New Job" log to "New Work" to be consistent with the solo
|
||||
version of the log. Added more data to both versions.
|
||||
|
||||
v3.14.0
|
||||
|
||||
Changes to solo mining:
|
||||
- segwit is supported by getblocktemplate,
|
||||
- longpolling is not working and is disabled,
|
||||
- Periodic Report log is output,
|
||||
- New Block log includes TTF estimates,
|
||||
- Stratum thread no longer created when using getwork or GBT.
|
||||
|
||||
Fixed BUG log mining sha256d.
|
||||
|
||||
v3.13.1.1
|
||||
|
||||
Fixed Windows crash mining minotaur algo.
|
||||
|
||||
Fixed GCC 10 compile again.
|
||||
Added -fno-common to testing to be consistent with GCC 10 default.
|
||||
|
||||
v3.13.1
|
||||
|
||||
Added minotaur algo for Ringcoin.
|
||||
|
||||
v3.13.0.1
|
||||
|
||||
Issue #262: Fixed xevan AVX2 invalid shares.
|
||||
|
||||
v3.13.0
|
||||
|
||||
Updated Windows binaries compiled with GCC 9. Included DLLs also updated.
|
||||
Icelake build (cpuminer-avx512-sha-vaes.exe) now included in Windows
|
||||
binaries package.
|
||||
|
||||
No source code changes.
|
||||
|
||||
v3.12.8.2
|
||||
|
||||
Fixed x12 AVX2 rejects.
|
||||
Fixed phi AVX2 crash.
|
||||
|
||||
v3.12.8.1
|
||||
|
||||
Issue #261: Fixed yescryptr8g invalid shares.
|
||||
|
||||
v3.12.8
|
||||
|
||||
Yespower sha256 prehash made thread safe.
|
||||
|
||||
Rewrote diff conversion functions from scratch to be simpler and use
|
||||
long double (float80) and int128 arithmetic for improved accuracy and
|
||||
precision.
|
||||
|
||||
Some code cleanup and assorted small changes.
|
||||
|
||||
v3.12.7
|
||||
|
||||
Issue #257: fixed a file descriptor leak which caused the CPU temperature
|
||||
and frequency query to report zeros after mining for a couple of hours.
|
||||
|
||||
Issue #253: stale share reduction for yescrypt, sonoa.
|
||||
|
||||
v3.12.6.1
|
||||
|
||||
Issue #252: Fixed SSL mining (stratum+tcps://)
|
||||
|
||||
Issue #254 Fixed benchmark.
|
||||
|
||||
Issue #253: Implemented stale share reduction for yespower, x25x, x22i, x21s,
|
||||
x16*, scryptn2, more to come.
|
||||
|
||||
v3.12.6
|
||||
|
||||
Issue #246: improved stale share detection for getwork.
|
||||
|
||||
Improved precision of target_to_diff conversion from 4 digits to 20+.
|
||||
|
||||
Display hash and target debug data for all rejected shares.
|
||||
|
||||
A graphical representation of CPU affinity is displayed when using --threads.
|
||||
|
||||
Added highest and lowest accepted share to summary log.
|
||||
|
||||
Other small changes to logs to improve consistency and clarity.
|
||||
|
||||
v3.12.5
|
||||
|
||||
Issues #246 & #251: fixed incorrect share diff for stratum and getwork,
|
||||
fixed incorrect target diff for getwork. Stats should now be correct for
|
||||
getwork as well as stratum.
|
||||
|
||||
Issue #252: Fixed stratum+tcps not using curl ssl.
|
||||
|
||||
Getwork: reduce stale blocks, faster response to new work.
|
||||
|
||||
Added ntime to new job/work logs.
|
||||
|
||||
README.md now lists the parameters for yespower variations that don't have
|
||||
a specific algo name.
|
||||
|
||||
v3.12.4.6
|
||||
|
||||
Issue #246: fixed getwork repeated new block logs with same height. New work
|
||||
for the same block is now reported as "New work" instead of "New block".
|
||||
Also added a check that work is new before generating "New work" log.
|
||||
|
||||
Added target diff to getwork new block log.
|
||||
|
||||
Changed share ratio in share result log to simple fraction, no longer %.
|
||||
|
||||
Added debug log to display mininginfo, use -D.
|
||||
|
||||
v3.12.4.5
|
||||
|
||||
Issue #246: better stale share detection for getwork, and enhanced logging
|
||||
of stale shares for stratum & getwork.
|
||||
|
||||
Issue #251: fixed incorrect share difficulty and share ratio in share
|
||||
result log.
|
||||
|
||||
Changed submit log to include share diff and block height.
|
||||
|
||||
Small cosmetic changes to logs.
|
||||
|
||||
v3.12.4.4
|
||||
|
||||
Issue #246: Fixed net hashrate in getwork block log,
|
||||
removed duplicate getwork block log,
|
||||
other small tweaks to stats logs for getwork.
|
||||
|
||||
Issue #248: Fixed chronic stale shares with scrypt:1048576 (scryptn2).
|
||||
|
||||
v3.12.4.3
|
||||
|
||||
Fixed segfault in new block log for getwork.
|
||||
|
||||
Disabled silent discarding of stale work after the submit is logged.
|
||||
|
||||
v3.12.4.2
|
||||
|
||||
Issue #245: fixed getwork stale shares, solo mining with getwork now works.
|
||||
|
||||
Issue #246: implemented block and summary logs for getwork.
|
||||
|
||||
v3.12.4.1
|
||||
|
||||
Issue #245: fix scantime when mining solo with getwork.
|
||||
|
||||
Added debug logs for creation of stratum and longpoll threads, use -D to
|
||||
enable.
|
||||
|
||||
v3.12.4
|
||||
|
||||
Issue #244: Change longpoll to ignore job id.
|
||||
|
||||
Lyra2rev2 AVX2 +3%, AVX512 +6%.
|
||||
|
||||
v3.12.3.1
|
||||
|
||||
Issue #241: Fixed regression that broke coinbase address in v3.11.7.
|
||||
|
||||
v3.12.3
|
||||
|
||||
Issue #238: Fixed skunk AVX2.
|
||||
|
||||
Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal
|
||||
increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s.
|
||||
|
||||
Faster anime VAES +57%, AVX512 +21%, AVX2 +3%.
|
||||
|
||||
Redesigned code reponsible for #236.
|
||||
|
||||
v3.12.2
|
||||
|
||||
Fixed xevan, skein, skein2 AVX2, #238.
|
||||
|
||||
Reversed polarity of AVX2 vector bit test utilities, and all users, to be
|
||||
logically and semantically correct. Follow up to issue #236.
|
||||
|
||||
v3.12.1
|
||||
|
||||
Fixed anime AVX2 low difficulty shares, git issue #236.
|
||||
|
||||
Periodic summary now reports lost hash rate due to rejected and stale shares,
|
||||
displayed only when non-zero.
|
||||
|
||||
v3.12.0.1
|
||||
|
||||
Fixed hodl rejects, git issue #237.
|
||||
|
||||
Fixed debug code added in v3.12.0 to work with AVX2 to be enabled only
|
||||
after low difficulty share have been seen to avoid unnecessarily excessive
|
||||
log outout.
|
||||
|
||||
Added more digits of precision to diff in log output to help diagnose
|
||||
low difficulty shares.
|
||||
|
||||
v3.12.0
|
||||
|
||||
Faster phi2 AVX2 +62%, AVX512 +150% on Intel CPUs. AMD Ryzen AVX2 is
|
||||
YMMV due to its inferiour AVX2 implementation.
|
||||
|
||||
Fixed Hodl stats, rejects are still an issue since v3.9.5, git issue #237.
|
||||
|
||||
API can now be enabled with "-b port" or "--api-bind port".
|
||||
It will use the default address 127.0.0.1.
|
||||
|
||||
Editorial: Short form options should only be used on the command line to save
|
||||
typing. Configuration files and scripts should always use the long form
|
||||
"--api-bind addr:port" without relying on any defaults. This is a general
|
||||
recommendation that applies to all options for any application.
|
||||
|
||||
Removed obsolete cryptonight, all variants, and supporting code for more
|
||||
size reduction and faster compiling.
|
||||
|
||||
Tweaked the timing of the CPU temperature and frequency log (Linux only).
|
||||
|
||||
Added some debug code to collect more info aboout low difficulty rejects,
|
||||
git issue #236.
|
||||
|
||||
v3.11.9
|
||||
|
||||
Fixed x16r invalid shares when Luffa was first in hash order.
|
||||
|
||||
API is disabled by default.
|
||||
|
||||
New startup message for status of stratum connection, API & extranonce.
|
||||
|
||||
New log report for CPU temperature, frequency of fastest and slowest cores.
|
||||
|
||||
Compile time is a little shorter and binary file size a little smaller
|
||||
using conditional compilation..
|
||||
|
||||
Removed code for Bastion, Drop, Heavy, Luffa an Pluck algos and other unused
|
||||
code.
|
||||
|
||||
v3.11.8
|
||||
|
||||
Fixed network hashrate showing incorrect data, should be close now.
|
||||
|
||||
Fixed compile errors when using GCC 10 with default flag -fno-common.
|
||||
|
||||
Faster x16r, x16rv2, x16rt, x16s, x21s, veil, hex with midstate prehash.
|
||||
|
||||
Decoupled sapling usage from block version 5 in yescryptr8g.
|
||||
|
||||
More detailed data reporting for low difficulty rejected shares.
|
||||
|
||||
v3.11.7
|
||||
|
||||
Added yescryptr8g algo for KOTO, including support for block version 5.
|
||||
|
||||
Added sha3d algo for BSHA3.
|
||||
|
||||
Removed memcmp and clean_job checks from get_new_work, now only check job_id.
|
||||
|
||||
Small improvement to sha512 and sha256 parallel implementations that don't
|
||||
use SHA.
|
||||
|
||||
v3.11.6
|
||||
|
||||
Fixed CPU temperature regression from v3.11.5.
|
||||
|
||||
More improvements to share log. More compact, highlight incremented counter,
|
||||
block height when solved, job id when stale.
|
||||
|
||||
v3.11.5
|
||||
|
||||
Fixed AVX512 detection that could cause compilation errors on CPUs
|
||||
without AVX512.
|
||||
|
||||
Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
|
||||
is solved.
|
||||
Added share counter to share submitited & accepted logs
|
||||
Added job id to share submitted log.
|
||||
Share submitted log is no longer highlighted blue, there was too much blue.
|
||||
|
||||
Another CPU temperature fix for Linux.
|
||||
|
||||
Added bug reporting tips to RELEASE NOTES.
|
||||
|
||||
v3.11.4
|
||||
|
||||
Fixed scrypt segfault since v3.9.9.1.
|
||||
|
||||
Stale shares counted and reported seperately from other rejected shares.
|
||||
|
||||
Display of counters for solved blocks, rejects, stale shares suppressed in
|
||||
periodic summary when zero.
|
||||
|
||||
v3.11.3
|
||||
|
||||
Fixed x12 AVX2 again.
|
||||
|
||||
More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.
|
||||
|
||||
Restored lost speed for x22i & x25x.
|
||||
|
||||
v3.11.2
|
||||
|
||||
Fixed x11gost (sib) AVX2 invalid shares.
|
||||
|
||||
Fixed x16r, x16rv2, x16s, x16rt, x16rt-veil (veil), x21s.
|
||||
No shares were submitted when cube, shavite or echo were the first function
|
||||
in the hash order.
|
||||
|
||||
Fixed all algos reporting stats problems when mining with SSE2.
|
||||
|
||||
Faster Lyra2 AVX512: lyra2z +47%, lyra2rev3 +11%, allium +13%, x21s +6%
|
||||
|
||||
Other minor performance improvements.
|
||||
|
||||
Known issue:
|
||||
|
||||
Lyra2 AVX512 improvements paradoxically reduced performance on x22i and x25x.
|
||||
https://github.com/JayDDee/cpuminer-opt/issues/225
|
||||
|
||||
v3.11.1
|
||||
|
||||
Faster panama for x25x AVX2 & AVX512.
|
||||
|
||||
Fixed echo VAES for Xevan.
|
||||
|
||||
Removed support for scryptjane algo.
|
||||
|
||||
Reverted macro implemtations of hash functions to SPH reference code
|
||||
for SSE2 versions of algos.
|
||||
|
||||
v3.11.0
|
||||
|
||||
Fixed x25x AVX512 lane 4 invalid shares.
|
||||
|
||||
AVX512 for hex, phi2.
|
||||
|
||||
VAES optimzation for Intel Icelake CPUs for most algos recently optimized
|
||||
with AVX512, source code only.
|
||||
|
||||
v3.10.7
|
||||
|
||||
AVX512 for x25x, lbry, x13bcd (bcd).
|
||||
|
||||
v3.10.6
|
||||
|
||||
Added support for SSL stratum: stratum+tcps://
|
||||
|
||||
Added job id reporting again, but leaner, suppressed with --quiet.
|
||||
|
||||
AVX512 for x21s, x22i, lyra2z, allium.
|
||||
|
||||
Fixed share overflow warnings mining lbry with Ryzen (SHA).
|
||||
|
||||
v3.10.5
|
||||
|
||||
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
|
||||
Faster hmq1725 AVX2.
|
||||
|
||||
v3.10.4
|
||||
|
||||
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
|
||||
|
||||
v3.10.3
|
||||
|
||||
AVX512 for x12, x13, x14, x15.
|
||||
Fixed x12 AVX2 invalid shares.
|
||||
|
||||
v.10.2
|
||||
|
||||
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
||||
Fixed c11 AVX2 invalid shares.
|
||||
|
||||
v3.10.1
|
||||
|
||||
AVX512 for blake2b, nist5, quark, tribus.
|
||||
|
||||
More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
|
||||
quark invalid shares AVX2.
|
||||
|
||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||
features are available but no longer listed.
|
||||
|
||||
v3.10.0
|
||||
|
||||
AVX512 is now supported on selected algos, Windows binary is now available.
|
||||
AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
|
||||
skein & skein2.
|
||||
|
||||
Fixed CPU temperature for some CPU models (Linux only).
|
||||
|
||||
Fixed a bug that caused some lanes not to submit shares.
|
||||
|
||||
Fixed some previously undetected buffer overflows.
|
||||
|
||||
Lyra2rev2 3% faster SSE2 and AVX2.
|
||||
|
||||
Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
|
||||
to fix known mingw issue.
|
||||
|
||||
Changed AVX2 build script to explicitly add AES to address change in
|
||||
behaviour in GCC 9.
|
||||
|
||||
v3.9.11
|
||||
|
||||
Added x22i & x25x algos.
|
||||
Blake2s 2% faster AVX2 with Intel CPU, slower with Ryzen v1, v2 ?
|
||||
|
||||
v3.9.10
|
||||
|
||||
Faster X* algos with AVX2.
|
||||
Small improvements to summary stats report.
|
||||
|
||||
v3.9.9.1
|
||||
|
||||
Fixed a day1 bug that could cause the miner to idle for up to 2 minutes
|
||||
under certain circumstances.
|
||||
|
||||
Redesigned summary stats report now includes session statistics.
|
||||
|
||||
More robust handling of statistics to reduce corruption.
|
||||
|
||||
Removed --hide-diff option.
|
||||
|
||||
Better handling of cpu-affinity with more than 64 CPUs.
|
||||
|
||||
v3.9.9
|
||||
|
||||
Added power2b algo for MicroBitcoin.
|
||||
Added generic yespower-b2b (yespower + blake2b) algo to be used with
|
||||
the parameters introduced in v3.9.7 for yespower & yescrypt.
|
||||
Display additional info when a share is rejected.
|
||||
Some low level enhancements and minor tweaking of log output.
|
||||
RELEASE_NOTES (this file) and README.md added to Windows release package.
|
||||
|
||||
v3.9.8.1
|
||||
|
||||
Summary log report will be generated on stratum diff change or after 5 minutes,
|
||||
whichever comes first, to prevent incorrect data in the report.
|
||||
|
||||
Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
|
||||
to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
|
||||
Until the fork Luxcoin can be mined using phi2 algo.
|
||||
|
||||
--hide-diff option is deprecated and has no effect. It will be removed in a
|
||||
future release.
|
||||
|
||||
v3.9.8
|
||||
|
||||
Changes to log output to provide data more relevant to actual mining
|
||||
performance.
|
||||
phi2 can now handle pools with a mix of coins that use and don't use roots.
|
||||
phi2-lux added as an alias for phi2 as they are identical except for roots.
|
||||
Add x16rv2 algo for Ravencoin fork.
|
||||
|
||||
v3.9.7
|
||||
|
||||
Command line option changes:
|
||||
|
||||
"-R" is no longer used as a shortcut for "--retry-pause", users must
|
||||
use the long option.
|
||||
|
||||
New options:
|
||||
|
||||
-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos
|
||||
-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is
|
||||
hardcoded with R=1
|
||||
-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos.
|
||||
|
||||
These options can be used to mine yescrypt or yespower variations using
|
||||
the generic yescrypt or yespower algo name and specifying the parameters
|
||||
manually. They can even be used to mine variations that aren't formally
|
||||
supported by a unique algo name. Existing algos can continue to to be mined
|
||||
using their original name without parameters.
|
||||
|
||||
v3.9.6.2
|
||||
|
||||
New algo blake2b.
|
||||
Faster myr-gr on Ryzen using SHA.
|
||||
Faster blake2s SSE2.
|
||||
Small speedup of around 1% for several other algos.
|
||||
|
||||
v3.9.6.1
|
||||
|
||||
New algos: x21s, hex (alias x16r-hex).
|
||||
|
||||
v3.9.6
|
||||
|
||||
New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
|
||||
|
||||
193
aclocal.m4
vendored
193
aclocal.m4
vendored
@@ -1,6 +1,6 @@
|
||||
# generated automatically by aclocal 1.15.1 -*- Autoconf -*-
|
||||
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
|
||||
If you have problems, you may need to regenerate the build system entirely.
|
||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||
|
||||
# Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
|
||||
# generated from the m4 files accompanying Automake X.Y.
|
||||
# (This private macro should not be called outside this file.)
|
||||
AC_DEFUN([AM_AUTOMAKE_VERSION],
|
||||
[am__api_version='1.15'
|
||||
[am__api_version='1.16'
|
||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||
dnl require some minimum version. Point them to the right macro.
|
||||
m4_if([$1], [1.15.1], [],
|
||||
m4_if([$1], [1.16.1], [],
|
||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||
])
|
||||
|
||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||
[AM_AUTOMAKE_VERSION([1.15.1])dnl
|
||||
[AM_AUTOMAKE_VERSION([1.16.1])dnl
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
|
||||
# Figure out how to run the assembler. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
|
||||
|
||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
|
||||
# AM_CONDITIONAL -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
Usually this means the macro was only invoked conditionally.]])
|
||||
fi])])
|
||||
|
||||
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -352,13 +352,12 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
||||
|
||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
|
||||
# _AM_OUTPUT_DEPENDENCY_COMMANDS
|
||||
# ------------------------------
|
||||
AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
@@ -366,49 +365,41 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
# Older Autoconf quotes --file arguments for eval, but not when files
|
||||
# are listed without --file. Let's play safe and only enable the eval
|
||||
# if we detect the quoting.
|
||||
case $CONFIG_FILES in
|
||||
*\'*) eval set x "$CONFIG_FILES" ;;
|
||||
*) set x $CONFIG_FILES ;;
|
||||
esac
|
||||
# TODO: see whether this extra hack can be removed once we start
|
||||
# requiring Autoconf 2.70 or later.
|
||||
AS_CASE([$CONFIG_FILES],
|
||||
[*\'*], [eval set x "$CONFIG_FILES"],
|
||||
[*], [set x $CONFIG_FILES])
|
||||
shift
|
||||
for mf
|
||||
# Used to flag and report bootstrapping failures.
|
||||
am_rc=0
|
||||
for am_mf
|
||||
do
|
||||
# Strip MF so we end up with the name of the file.
|
||||
mf=`echo "$mf" | sed -e 's/:.*$//'`
|
||||
# Check whether this is an Automake generated Makefile or not.
|
||||
# We used to match only the files named 'Makefile.in', but
|
||||
# some people rename them; so instead we look at the file content.
|
||||
# Grep'ing the first line is not enough: some people post-process
|
||||
# each Makefile.in and add a new line on top of each file to say so.
|
||||
# Grep'ing the whole file is not good either: AIX grep has a line
|
||||
am_mf=`AS_ECHO(["$am_mf"]) | sed -e 's/:.*$//'`
|
||||
# Check whether this is an Automake generated Makefile which includes
|
||||
# dependency-tracking related rules and includes.
|
||||
# Grep'ing the whole file directly is not great: AIX grep has a line
|
||||
# limit of 2048, but all sed's we know have understand at least 4000.
|
||||
if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then
|
||||
dirpart=`AS_DIRNAME("$mf")`
|
||||
else
|
||||
continue
|
||||
fi
|
||||
# Extract the definition of DEPDIR, am__include, and am__quote
|
||||
# from the Makefile without running 'make'.
|
||||
DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
|
||||
test -z "$DEPDIR" && continue
|
||||
am__include=`sed -n 's/^am__include = //p' < "$mf"`
|
||||
test -z "$am__include" && continue
|
||||
am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
|
||||
# Find all dependency output files, they are included files with
|
||||
# $(DEPDIR) in their names. We invoke sed twice because it is the
|
||||
# simplest approach to changing $(DEPDIR) to its actual value in the
|
||||
# expansion.
|
||||
for file in `sed -n "
|
||||
s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
|
||||
sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do
|
||||
# Make sure the directory exists.
|
||||
test -f "$dirpart/$file" && continue
|
||||
fdir=`AS_DIRNAME(["$file"])`
|
||||
AS_MKDIR_P([$dirpart/$fdir])
|
||||
# echo "creating $dirpart/$file"
|
||||
echo '# dummy' > "$dirpart/$file"
|
||||
done
|
||||
sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \
|
||||
|| continue
|
||||
am_dirpart=`AS_DIRNAME(["$am_mf"])`
|
||||
am_filepart=`AS_BASENAME(["$am_mf"])`
|
||||
AM_RUN_LOG([cd "$am_dirpart" \
|
||||
&& sed -e '/# am--include-marker/d' "$am_filepart" \
|
||||
| $MAKE -f - am--depfiles]) || am_rc=$?
|
||||
done
|
||||
if test $am_rc -ne 0; then
|
||||
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
||||
for automatic dependency tracking. Try re-running configure with the
|
||||
'--disable-dependency-tracking' option to at least be able to build
|
||||
the package (albeit without support for automatic dependency tracking).])
|
||||
fi
|
||||
AS_UNSET([am_dirpart])
|
||||
AS_UNSET([am_filepart])
|
||||
AS_UNSET([am_mf])
|
||||
AS_UNSET([am_rc])
|
||||
rm -f conftest-deps.mk
|
||||
}
|
||||
])# _AM_OUTPUT_DEPENDENCY_COMMANDS
|
||||
|
||||
@@ -417,18 +408,17 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
# -----------------------------
|
||||
# This macro should only be invoked once -- use via AC_REQUIRE.
|
||||
#
|
||||
# This code is only required when automatic dependency tracking
|
||||
# is enabled. FIXME. This creates each '.P' file that we will
|
||||
# need in order to bootstrap the dependency handling code.
|
||||
# This code is only required when automatic dependency tracking is enabled.
|
||||
# This creates each '.Po' and '.Plo' makefile fragment that we'll need in
|
||||
# order to bootstrap the dependency handling code.
|
||||
AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
[AC_CONFIG_COMMANDS([depfiles],
|
||||
[test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
[AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
|
||||
])
|
||||
[AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}"])])
|
||||
|
||||
# Do all the work for Automake. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -515,8 +505,8 @@ AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl
|
||||
AC_REQUIRE([AC_PROG_MKDIR_P])dnl
|
||||
# For better backward compatibility. To be removed once Automake 1.9.x
|
||||
# dies out for good. For more background, see:
|
||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
|
||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
|
||||
# <https://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
|
||||
# <https://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
|
||||
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
|
||||
# We need awk for the "check" target (and possibly the TAP driver). The
|
||||
# system "awk" is bad on some platforms.
|
||||
@@ -583,7 +573,7 @@ END
|
||||
Aborting the configuration process, to ensure you take notice of the issue.
|
||||
|
||||
You can download and install GNU coreutils to get an 'rm' implementation
|
||||
that behaves properly: <http://www.gnu.org/software/coreutils/>.
|
||||
that behaves properly: <https://www.gnu.org/software/coreutils/>.
|
||||
|
||||
If you want to complete the configuration process using your problematic
|
||||
'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
|
||||
@@ -625,7 +615,7 @@ for _am_header in $config_headers :; do
|
||||
done
|
||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -646,7 +636,7 @@ if test x"${install_sh+set}" != xset; then
|
||||
fi
|
||||
AC_SUBST([install_sh])])
|
||||
|
||||
# Copyright (C) 2003-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -668,7 +658,7 @@ AC_SUBST([am__leading_dot])])
|
||||
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
||||
# From Jim Meyering
|
||||
|
||||
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -703,7 +693,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
||||
|
||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -711,49 +701,42 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
||||
|
||||
# AM_MAKE_INCLUDE()
|
||||
# -----------------
|
||||
# Check to see how make treats includes.
|
||||
# Check whether make has an 'include' directive that can support all
|
||||
# the idioms we need for our automatic dependency tracking code.
|
||||
AC_DEFUN([AM_MAKE_INCLUDE],
|
||||
[am_make=${MAKE-make}
|
||||
cat > confinc << 'END'
|
||||
[AC_MSG_CHECKING([whether ${MAKE-make} supports the include directive])
|
||||
cat > confinc.mk << 'END'
|
||||
am__doit:
|
||||
@echo this is the am__doit target
|
||||
@echo this is the am__doit target >confinc.out
|
||||
.PHONY: am__doit
|
||||
END
|
||||
# If we don't find an include directive, just comment out the code.
|
||||
AC_MSG_CHECKING([for style of include used by $am_make])
|
||||
am__include="#"
|
||||
am__quote=
|
||||
_am_result=none
|
||||
# First try GNU make style include.
|
||||
echo "include confinc" > confmf
|
||||
# Ignore all kinds of additional output from 'make'.
|
||||
case `$am_make -s -f confmf 2> /dev/null` in #(
|
||||
*the\ am__doit\ target*)
|
||||
am__include=include
|
||||
am__quote=
|
||||
_am_result=GNU
|
||||
;;
|
||||
esac
|
||||
# Now try BSD make style include.
|
||||
if test "$am__include" = "#"; then
|
||||
echo '.include "confinc"' > confmf
|
||||
case `$am_make -s -f confmf 2> /dev/null` in #(
|
||||
*the\ am__doit\ target*)
|
||||
am__include=.include
|
||||
am__quote="\""
|
||||
_am_result=BSD
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
AC_SUBST([am__include])
|
||||
AC_SUBST([am__quote])
|
||||
AC_MSG_RESULT([$_am_result])
|
||||
rm -f confinc confmf
|
||||
])
|
||||
# BSD make does it like this.
|
||||
echo '.include "confinc.mk" # ignored' > confmf.BSD
|
||||
# Other make implementations (GNU, Solaris 10, AIX) do it like this.
|
||||
echo 'include confinc.mk # ignored' > confmf.GNU
|
||||
_am_result=no
|
||||
for s in GNU BSD; do
|
||||
AM_RUN_LOG([${MAKE-make} -f confmf.$s && cat confinc.out])
|
||||
AS_CASE([$?:`cat confinc.out 2>/dev/null`],
|
||||
['0:this is the am__doit target'],
|
||||
[AS_CASE([$s],
|
||||
[BSD], [am__include='.include' am__quote='"'],
|
||||
[am__include='include' am__quote=''])])
|
||||
if test "$am__include" != "#"; then
|
||||
_am_result="yes ($s style)"
|
||||
break
|
||||
fi
|
||||
done
|
||||
rm -f confinc.* confmf.*
|
||||
AC_MSG_RESULT([${_am_result}])
|
||||
AC_SUBST([am__include])])
|
||||
AC_SUBST([am__quote])])
|
||||
|
||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -792,7 +775,7 @@ fi
|
||||
|
||||
# Helper functions for option handling. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -821,7 +804,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
||||
AC_DEFUN([_AM_IF_OPTION],
|
||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||
|
||||
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -868,7 +851,7 @@ AC_LANG_POP([C])])
|
||||
# For backward compatibility.
|
||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -887,7 +870,7 @@ AC_DEFUN([AM_RUN_LOG],
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -968,7 +951,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
rm -f conftest.file
|
||||
])
|
||||
|
||||
# Copyright (C) 2009-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1028,7 +1011,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||
])
|
||||
|
||||
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1056,7 +1039,7 @@ fi
|
||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||
|
||||
# Copyright (C) 2006-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1075,7 +1058,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
||||
|
||||
# Check how to create a tarball. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2004-2017 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
|
||||
317
algo-gate-api.c
317
algo-gate-api.c
@@ -90,39 +90,174 @@ void algo_not_implemented()
|
||||
}
|
||||
|
||||
// default null functions
|
||||
|
||||
// deprecated, use generic as default
|
||||
int null_scanhash()
|
||||
{
|
||||
applog(LOG_WARNING,"SWERR: undefined scanhash function in algo_gate");
|
||||
return 0;
|
||||
}
|
||||
|
||||
void null_hash()
|
||||
// Default generic scanhash can be used in many cases.
|
||||
int scanhash_generic( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
do
|
||||
{
|
||||
edata[19] = n;
|
||||
if ( likely( algo_gate.hash( hash, edata, thr_id ) ) )
|
||||
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
n++;
|
||||
} while ( n < last_nonce && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
//int scanhash_4way_64_64( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
//int scanhash_4way_64_640( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = &(hash32[ 7*4 ]);
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( likely( algo_gate.hash( hash32, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//int scanhash_8way_32_32( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
//int scanhash_8way_64_640( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = &(hash32[7*8]);
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( likely( algo_gate.hash( hash32, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//int scanhash_16way_32_32( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int null_hash()
|
||||
{
|
||||
applog(LOG_WARNING,"SWERR: null_hash unsafe null function");
|
||||
};
|
||||
void null_hash_suw()
|
||||
{
|
||||
applog(LOG_WARNING,"SWERR: null_hash_suw unsafe null function");
|
||||
return 0;
|
||||
};
|
||||
|
||||
void init_algo_gate( algo_gate_t* gate )
|
||||
{
|
||||
gate->miner_thread_init = (void*)&return_true;
|
||||
gate->scanhash = (void*)&null_scanhash;
|
||||
gate->scanhash = (void*)&scanhash_generic;
|
||||
gate->hash = (void*)&null_hash;
|
||||
gate->hash_suw = (void*)&null_hash_suw;
|
||||
gate->get_new_work = (void*)&std_get_new_work;
|
||||
gate->get_nonceptr = (void*)&std_get_nonceptr;
|
||||
gate->work_decode = (void*)&std_le_work_decode;
|
||||
gate->decode_extra_data = (void*)&do_nothing;
|
||||
gate->wait_for_diff = (void*)&std_wait_for_diff;
|
||||
gate->get_max64 = (void*)&get_max64_0x1fffffLL;
|
||||
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
|
||||
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
|
||||
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
|
||||
gate->malloc_txs_request = (void*)&std_malloc_txs_request;
|
||||
gate->set_target = (void*)&std_set_target;
|
||||
gate->submit_getwork_result = (void*)&std_le_submit_getwork_result;
|
||||
gate->build_block_header = (void*)&std_build_block_header;
|
||||
gate->build_extraheader = (void*)&std_build_extraheader;
|
||||
@@ -132,7 +267,6 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->resync_threads = (void*)&do_nothing;
|
||||
gate->do_this_thread = (void*)&return_true;
|
||||
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
|
||||
gate->stratum_handle_response = (void*)&std_stratum_handle_response;
|
||||
gate->get_work_data_size = (void*)&std_get_work_data_size;
|
||||
gate->optimizations = EMPTY_SET;
|
||||
gate->ntime_index = STD_NTIME_INDEX;
|
||||
@@ -165,30 +299,23 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_DROP: register_drop_algo ( gate ); break;
|
||||
case ALGO_FRESH: register_fresh_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
|
||||
case ALGO_HEX: register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
@@ -196,21 +323,22 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: register_phi2_algo ( gate ); break;
|
||||
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
@@ -233,22 +361,23 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
/* case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_05_algo ( gate ); break;
|
||||
*/
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
@@ -267,30 +396,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
// restore warnings
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// override std defaults with jr2 defaults
|
||||
bool register_json_rpc2( algo_gate_t *gate )
|
||||
{
|
||||
applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
|
||||
gate->wait_for_diff = (void*)&do_nothing;
|
||||
gate->get_new_work = (void*)&jr2_get_new_work;
|
||||
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
|
||||
gate->stratum_gen_work = (void*)&jr2_stratum_gen_work;
|
||||
gate->build_stratum_request = (void*)&jr2_build_stratum_request;
|
||||
gate->submit_getwork_result = (void*)&jr2_submit_getwork_result;
|
||||
gate->longpoll_rpc_call = (void*)&jr2_longpoll_rpc_call;
|
||||
gate->work_decode = (void*)&jr2_work_decode;
|
||||
gate->stratum_handle_response = (void*)&jr2_stratum_handle_response;
|
||||
gate->nonce_index = JR2_NONCE_INDEX;
|
||||
jsonrpc_2 = true; // still needed
|
||||
opt_extranonce = false;
|
||||
// have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// run the alternate hash function for a specific algo
|
||||
void exec_hash_function( int algo, void *output, const void *pdata )
|
||||
{
|
||||
algo_gate_t gate;
|
||||
@@ -310,38 +415,37 @@ void exec_hash_function( int algo, void *output, const void *pdata )
|
||||
const char* const algo_alias_map[][2] =
|
||||
{
|
||||
// alias proper
|
||||
{ "argon2d-crds", "argon2d250" },
|
||||
{ "argon2d-dyn", "argon2d500" },
|
||||
{ "argon2d-uis", "argon2d4096" },
|
||||
{ "bitcore", "timetravel10" },
|
||||
{ "bitzeny", "yescryptr8" },
|
||||
{ "blake256r8", "blakecoin" },
|
||||
{ "blake256r8vnl", "vanilla" },
|
||||
{ "blake256r14", "blake" },
|
||||
{ "blake256r14dcr", "decred" },
|
||||
{ "cryptonote", "cryptonight" },
|
||||
{ "cryptonight-light", "cryptolight" },
|
||||
{ "diamond", "dmd-gr" },
|
||||
{ "droplp", "drop" },
|
||||
{ "espers", "hmq1725" },
|
||||
{ "flax", "c11" },
|
||||
{ "hsr", "x13sm3" },
|
||||
{ "jackpot", "jha" },
|
||||
{ "jane", "scryptjane" },
|
||||
{ "lyra2", "lyra2re" },
|
||||
{ "lyra2v2", "lyra2rev2" },
|
||||
{ "lyra2v3", "lyra2rev3" },
|
||||
{ "myrgr", "myr-gr" },
|
||||
{ "myriad", "myr-gr" },
|
||||
{ "neo", "neoscrypt" },
|
||||
{ "phi", "phi1612" },
|
||||
// { "sia", "blake2b" },
|
||||
{ "sib", "x11gost" },
|
||||
{ "timetravel8", "timetravel" },
|
||||
{ "veil", "x16rt-veil" },
|
||||
{ "yenten", "yescryptr16" },
|
||||
{ "ziftr", "zr5" },
|
||||
{ NULL, NULL }
|
||||
{ "argon2d-crds", "argon2d250" },
|
||||
{ "argon2d-dyn", "argon2d500" },
|
||||
{ "argon2d-uis", "argon2d4096" },
|
||||
{ "bcd", "x13bcd" },
|
||||
{ "bitcore", "timetravel10" },
|
||||
{ "bitzeny", "yescryptr8" },
|
||||
{ "blake256r8", "blakecoin" },
|
||||
{ "blake256r8vnl", "vanilla" },
|
||||
{ "blake256r14", "blake" },
|
||||
{ "blake256r14dcr", "decred" },
|
||||
{ "diamond", "dmd-gr" },
|
||||
{ "espers", "hmq1725" },
|
||||
{ "flax", "c11" },
|
||||
{ "hsr", "x13sm3" },
|
||||
{ "jackpot", "jha" },
|
||||
{ "jane", "scryptjane" },
|
||||
{ "lyra2", "lyra2re" },
|
||||
{ "lyra2v2", "lyra2rev2" },
|
||||
{ "lyra2v3", "lyra2rev3" },
|
||||
{ "myrgr", "myr-gr" },
|
||||
{ "myriad", "myr-gr" },
|
||||
{ "neo", "neoscrypt" },
|
||||
{ "phi", "phi1612" },
|
||||
{ "scryptn2", "scrypt:1048576" },
|
||||
{ "sib", "x11gost" },
|
||||
{ "timetravel8", "timetravel" },
|
||||
{ "veil", "x16rt-veil" },
|
||||
{ "x16r-hex", "hex" },
|
||||
{ "yenten", "yescryptr16" },
|
||||
{ "ziftr", "zr5" },
|
||||
{ NULL, NULL }
|
||||
};
|
||||
|
||||
// if arg is a valid alias for a known algo it is updated with the proper
|
||||
@@ -354,7 +458,7 @@ void get_algo_alias( char** algo_or_alias )
|
||||
if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ ALIAS ] ) )
|
||||
{
|
||||
// found valid alias, return proper name
|
||||
*algo_or_alias = (char* const)( algo_alias_map[i][ PROPER ] );
|
||||
*algo_or_alias = (char*)( algo_alias_map[i][ PROPER ] );
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -362,40 +466,3 @@ void get_algo_alias( char** algo_or_alias )
|
||||
#undef ALIAS
|
||||
#undef PROPER
|
||||
|
||||
bool submit_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
if ( submit_work( thr, work ) )
|
||||
{
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
thr->id, work->job_id );
|
||||
return true;
|
||||
}
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
return false;
|
||||
}
|
||||
|
||||
bool submit_lane_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr, int lane )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
if ( submit_work( thr, work ) )
|
||||
{
|
||||
if ( !opt_quiet )
|
||||
// applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
|
||||
// accepted_share_count + rejected_share_count + 1,
|
||||
// thr->id, lane );
|
||||
applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
|
||||
accepted_share_count + rejected_share_count + 1, thr->id,
|
||||
lane, work->job_id );
|
||||
return true;
|
||||
}
|
||||
else
|
||||
applog( LOG_WARNING, "Failed to submit share." );
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
227
algo-gate-api.h
227
algo-gate-api.h
@@ -35,7 +35,7 @@
|
||||
// 6. Determine if other non existant functions are required.
|
||||
// That is determined by the need to add code in cpu-miner.c
|
||||
// that applies only to the new algo. That is forbidden. All
|
||||
// algo specific code must be in theh algo's file.
|
||||
// algo specific code must be in the algo's file.
|
||||
//
|
||||
// 7. If new functions need to be added to the gate add the type
|
||||
// to the structure, declare a null instance in this file and define
|
||||
@@ -48,10 +48,10 @@
|
||||
// instances as they are defined by default, or unsafe functions that
|
||||
// are not needed by the algo.
|
||||
//
|
||||
// 9. Add an case entry to the switch/case in function register_gate
|
||||
// 9. Add a case entry to the switch/case in function register_gate
|
||||
// in file algo-gate-api.c for the new algo.
|
||||
//
|
||||
// 10 If a new function type was defined add an entry to ini talgo_gate
|
||||
// 10 If a new function type was defined add an entry to init algo_gate
|
||||
// to initialize the new function to its null instance described in step 7.
|
||||
//
|
||||
// 11. If the new algo has aliases add them to the alias array in
|
||||
@@ -75,7 +75,7 @@
|
||||
|
||||
// my hack at creating a set data type using bit masks. Set inclusion,
|
||||
// exclusion union and intersection operations are provided for convenience. In // some cases it may be desireable to use boolean algebra directly on the
|
||||
// data to perfomr set operations. Sets can be represented as single
|
||||
// data to perform set operations. Sets can be represented as single
|
||||
// elements, a bitwise OR of multiple elements, a bitwise OR of multiple
|
||||
// set variables or constants, or combinations of the above.
|
||||
// Examples:
|
||||
@@ -85,14 +85,17 @@
|
||||
|
||||
typedef uint32_t set_t;
|
||||
|
||||
#define EMPTY_SET 0
|
||||
#define SSE2_OPT 1
|
||||
#define AES_OPT 2
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8
|
||||
#define AVX2_OPT 0x10
|
||||
#define SHA_OPT 0x20
|
||||
#define AVX512_OPT 0x40
|
||||
#define EMPTY_SET 0
|
||||
#define SSE2_OPT 1
|
||||
#define AES_OPT 2
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8 // Sandybridge
|
||||
#define AVX2_OPT 0x10 // Haswell, Zen1
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
|
||||
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
|
||||
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
|
||||
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
||||
@@ -108,45 +111,62 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// special case, only one target, provides a callback for scanhash to
|
||||
// submit work with less overhead.
|
||||
// bool (*submit_work ) ( struct thr_info*, const struct work* );
|
||||
|
||||
// mandatory functions, must be overwritten
|
||||
// Added a 5th arg for the thread_info structure to replace the int thr id
|
||||
// in the first arg. Both will co-exist during the trasition.
|
||||
//int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
|
||||
// Mandatory functions, one of these is mandatory. If a generic scanhash
|
||||
// is used a custom target hash function must be registered, with a custom
|
||||
// scanhash the target hash function can be called directly and doesn't need
|
||||
// to be registered in the gate.
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
// optional unsafe, must be overwritten if algo uses function
|
||||
void ( *hash ) ( void*, const void*, uint32_t ) ;
|
||||
void ( *hash_suw ) ( void*, const void* );
|
||||
int ( *hash ) ( void*, const void*, int );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
|
||||
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
|
||||
bool );
|
||||
uint32_t *( *get_nonceptr ) ( uint32_t* );
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
void ( *wait_for_diff ) ( struct stratum_ctx* );
|
||||
int64_t ( *get_max64 ) ();
|
||||
bool ( *work_decode ) ( const json_t*, struct work* );
|
||||
void ( *set_target) ( struct work*, double );
|
||||
bool ( *submit_getwork_result ) ( CURL*, struct work* );
|
||||
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
|
||||
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
|
||||
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
|
||||
uint32_t*, uint32_t, uint32_t );
|
||||
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
bool ( *do_this_thread ) ( int );
|
||||
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
|
||||
bool ( *stratum_handle_response )( json_t* );
|
||||
|
||||
// Allocate thread local buffers and other initialization specific to miner
|
||||
// threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
|
||||
|
||||
// Decode getwork blockheader
|
||||
bool ( *work_decode ) ( struct work* );
|
||||
|
||||
// Extra getwork data
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
|
||||
bool ( *submit_getwork_result ) ( CURL*, struct work* );
|
||||
|
||||
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
|
||||
|
||||
// Increment extranonce
|
||||
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
|
||||
|
||||
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
|
||||
uint32_t*, uint32_t, uint32_t,
|
||||
unsigned char* );
|
||||
|
||||
// Build mining.submit message
|
||||
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
|
||||
// Big or little
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
|
||||
// Wait for first work
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
|
||||
// Diverge mining threads
|
||||
bool ( *do_this_thread ) ( int );
|
||||
|
||||
// After do_this_thread
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
|
||||
// No longer needed
|
||||
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
|
||||
|
||||
set_t optimizations;
|
||||
int ( *get_work_data_size ) ();
|
||||
int ntime_index;
|
||||
@@ -184,69 +204,80 @@ void four_way_not_tested();
|
||||
#define STD_WORK_DATA_SIZE 128
|
||||
#define STD_WORK_CMP_SIZE 76
|
||||
|
||||
#define JR2_NONCE_INDEX 39 // 8 bit offset
|
||||
//#define JR2_NONCE_INDEX 39 // 8 bit offset
|
||||
|
||||
// These indexes are only used with JSON RPC2 and are not gated.
|
||||
#define JR2_WORK_CMP_INDEX_2 43
|
||||
#define JR2_WORK_CMP_SIZE_2 33
|
||||
//#define JR2_WORK_CMP_INDEX_2 43
|
||||
//#define JR2_WORK_CMP_SIZE_2 33
|
||||
|
||||
// allways returns failure
|
||||
// deprecated, use generic instead
|
||||
int null_scanhash();
|
||||
|
||||
// Allow algos to submit from scanhash loop.
|
||||
bool submit_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr );
|
||||
bool submit_lane_solution( struct work *work, void *hash,
|
||||
struct thr_info *thr, int lane );
|
||||
// Default generic, may be used in many cases.
|
||||
// N-way is more complicated, requires many different implementations
|
||||
// depending on architecture, input format, and output format.
|
||||
// Naming convention is scanhash_[N]way_[input format]in_[output format]out
|
||||
// N = number of lanes
|
||||
// input/output format:
|
||||
// 32: 32 bit interleaved parallel lanes
|
||||
// 64: 64 bit interleaved parallel lanes
|
||||
// 640: input only, not interleaved, contiguous serial 640 bit lanes.
|
||||
// 256: output only, not interleaved, contiguous serial 256 bit lanes.
|
||||
|
||||
|
||||
bool submit_work( struct thr_info *thr, const struct work *work_in );
|
||||
int scanhash_generic( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
//int scanhash_4way_64in_64out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//int scanhash_4way_64in_256out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//int scanhash_8way_32in_32out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//int scanhash_8way_64in_256out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//int scanhash_16way_32in_32out( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
// displays warning
|
||||
void null_hash ();
|
||||
void null_hash_suw();
|
||||
int null_hash ();
|
||||
|
||||
// optional safe targets, default listed first unless noted.
|
||||
|
||||
void std_wait_for_diff();
|
||||
|
||||
uint32_t *std_get_nonceptr( uint32_t *work_data );
|
||||
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
|
||||
|
||||
void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr, bool clean_job );
|
||||
void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr );
|
||||
|
||||
void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
|
||||
void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
|
||||
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
// pick your favorite or define your own
|
||||
int64_t get_max64_0x1fffffLL(); // default
|
||||
int64_t get_max64_0x40LL();
|
||||
int64_t get_max64_0x3ffff();
|
||||
int64_t get_max64_0x3fffffLL();
|
||||
int64_t get_max64_0x1ffff();
|
||||
int64_t get_max64_0xffffLL();
|
||||
|
||||
void std_set_target( struct work *work, double job_diff );
|
||||
void alt_set_target( struct work* work, double job_diff );
|
||||
void scrypt_set_target( struct work *work, double job_diff );
|
||||
|
||||
bool std_le_work_decode( const json_t *val, struct work *work );
|
||||
bool std_be_work_decode( const json_t *val, struct work *work );
|
||||
bool jr2_work_decode( const json_t *val, struct work *work );
|
||||
bool std_le_work_decode( struct work *work );
|
||||
bool std_be_work_decode( struct work *work );
|
||||
|
||||
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
|
||||
|
||||
void std_le_build_stratum_request( char *req, struct work *work );
|
||||
void std_be_build_stratum_request( char *req, struct work *work );
|
||||
void jr2_build_stratum_request ( char *req, struct work *work );
|
||||
|
||||
char* std_malloc_txs_request( struct work *work );
|
||||
|
||||
@@ -256,16 +287,13 @@ void set_work_data_big_endian( struct work *work );
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
||||
void std_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits );
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits,
|
||||
unsigned char *final_sapling_hash );
|
||||
|
||||
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
|
||||
|
||||
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
|
||||
json_t* jr2_longpoll_rpc_call( CURL *curl, int *err );
|
||||
|
||||
bool std_stratum_handle_response( json_t *val );
|
||||
bool jr2_stratum_handle_response( json_t *val );
|
||||
|
||||
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id );
|
||||
@@ -278,19 +306,16 @@ int std_get_work_data_size();
|
||||
// by calling the algo's register function.
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate );
|
||||
|
||||
// Override any default gate functions that are applicable and do any other
|
||||
// algo-specific initialization.
|
||||
// Called by algos to verride any default gate functions that are applicable
|
||||
// and do any other algo-specific initialization.
|
||||
// The register functions for all the algos can be declared here to reduce
|
||||
// compiler warnings but that's just more work for devs adding new algos.
|
||||
bool register_algo( algo_gate_t *gate );
|
||||
|
||||
// Overrides a common set of functions used by RPC2 and other RPC2-specific
|
||||
// init. Called by algo's register function before initializing algo-specific
|
||||
// functions and data.
|
||||
bool register_json_rpc2( algo_gate_t *gate );
|
||||
|
||||
// use this to call the hash function of an algo directly, ie util.c test.
|
||||
void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
|
||||
void get_algo_alias( char** algo_or_alias );
|
||||
// Validate a string as a known algo and alias, updates arg to proper
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
|
||||
@@ -62,9 +62,7 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
argon2hash(hash, endiandata);
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio(work, hash);
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
@@ -74,19 +72,14 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t argon2_get_max64 ()
|
||||
{
|
||||
return 0x1ffLL;
|
||||
}
|
||||
|
||||
bool register_argon2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->get_max64 = (void*)&argon2_get_max64;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "argon2d-gate.h"
|
||||
#include "simd-utils.h"
|
||||
#include "argon2d/argon2.h"
|
||||
|
||||
static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Input Length = Salt Length (salt = input)
|
||||
@@ -36,7 +37,7 @@ void argon2d_crds_hash( void *output, const void *input )
|
||||
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -45,11 +46,11 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_crds_hash( hash, endiandata );
|
||||
be32enc(&edata[19], nonce);
|
||||
argon2d_crds_hash( hash, edata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
@@ -67,8 +68,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
||||
gate->hash = (void*)&argon2d_crds_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -103,31 +104,32 @@ void argon2d_dyn_hash( void *output, const void *input )
|
||||
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t first_nonce = (const uint32_t)pdata[19];
|
||||
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
||||
uint32_t nonce = first_nonce;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
do
|
||||
{
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2d_dyn_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
|
||||
edata[19] = nonce;
|
||||
argon2d_dyn_hash( hash, edata );
|
||||
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
|
||||
&& !bench ) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
pdata[19] = bswap_32( nonce );;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -135,8 +137,8 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
||||
gate->hash = (void*)&argon2d_dyn_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -146,47 +148,42 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) vhash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t t_cost = 1; // 1 iteration
|
||||
uint32_t m_cost = 4096; // use 4MB
|
||||
uint32_t parallelism = 1; // 1 thread, 2 lanes
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
|
||||
do {
|
||||
be32enc( &endiandata[19], n );
|
||||
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
|
||||
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
|
||||
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
|
||||
edata[19] = n;
|
||||
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) edata, 80,
|
||||
(char*) edata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
|
||||
if ( unlikely( valid_hash( vhash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
be32enc( &pdata[19], n );
|
||||
submit_solution( work, vhash, mythr );
|
||||
}
|
||||
n++;
|
||||
} while ( likely( n < last_nonce && !work_restart[thr_id].restart ) );
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t get_max64_0x1ff() { return 0x1ff; }
|
||||
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0x1ff;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <mm_malloc.h>
|
||||
|
||||
#include "core.h"
|
||||
#include "argon2d_thread.h"
|
||||
@@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
|
||||
if (context->allocate_cbk) {
|
||||
(context->allocate_cbk)(memory, memory_size);
|
||||
} else {
|
||||
*memory = malloc(memory_size);
|
||||
*memory = _mm_malloc( memory_size, 64 );
|
||||
// *memory = malloc(memory_size);
|
||||
}
|
||||
|
||||
if (*memory == NULL) {
|
||||
@@ -116,7 +118,8 @@ void free_memory(const argon2_context *context, uint8_t *memory,
|
||||
if (context->free_cbk) {
|
||||
(context->free_cbk)(memory, memory_size);
|
||||
} else {
|
||||
free(memory);
|
||||
// free(memory);
|
||||
_mm_free( memory );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
#include "argon2.h"
|
||||
#include "core.h"
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blamka-round-opt.h"
|
||||
|
||||
@@ -37,24 +37,28 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static void fill_block(__m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
block_XY[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
block_XY[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
state[ 4], state[ 5], state[ 6], state[ 7] );
|
||||
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
|
||||
state[ 9], state[11], state[13], state[15] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_1(
|
||||
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
|
||||
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_2(
|
||||
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
|
||||
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
|
||||
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
|
||||
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,14 +87,14 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
block_XY[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,21 +116,9 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
|
||||
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
|
||||
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
block_XY[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
|
||||
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if !defined(__AVX512F__)
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(__XOP__)
|
||||
@@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
||||
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
|
||||
#define rotr32( x ) mm256_ror_64( x, 32 )
|
||||
#define rotr24( x ) mm256_ror_64( x, 24 )
|
||||
#define rotr16( x ) mm256_ror_64( x, 16 )
|
||||
#define rotr63( x ) mm256_rol_64( x, 1 )
|
||||
|
||||
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
||||
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
//#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
//#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
|
||||
|
||||
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
||||
__m256i ml0, ml1; \
|
||||
ml0 = _mm256_mul_epu32(A0, B0); \
|
||||
ml1 = _mm256_mul_epu32(A1, B1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
|
||||
D0 = _mm256_xor_si256(D0, A0); \
|
||||
D0 = rotr32(D0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C0, D0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
||||
\
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B0 = rotr24(B0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(A1, B1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
||||
D1 = _mm256_xor_si256(D1, A1); \
|
||||
D0 = rotr32(D0); \
|
||||
D1 = rotr32(D1); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C1, D1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
||||
\
|
||||
ml0 = _mm256_mul_epu32(C0, D0); \
|
||||
ml1 = _mm256_mul_epu32(C1, D1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B1 = _mm256_xor_si256(B1, C1); \
|
||||
B0 = rotr24(B0); \
|
||||
B1 = rotr24(B1); \
|
||||
} while((void)0, 0);
|
||||
|
||||
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
||||
__m256i ml0, ml1; \
|
||||
ml0 = _mm256_mul_epu32(A0, B0); \
|
||||
ml1 = _mm256_mul_epu32(A1, B1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
|
||||
D0 = _mm256_xor_si256(D0, A0); \
|
||||
D0 = rotr16(D0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C0, D0); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B0 = rotr63(B0); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(A1, B1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
||||
D1 = _mm256_xor_si256(D1, A1); \
|
||||
D0 = rotr16(D0); \
|
||||
D1 = rotr16(D1); \
|
||||
\
|
||||
ml = _mm256_mul_epu32(C1, D1); \
|
||||
ml = _mm256_add_epi64(ml, ml); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
||||
ml0 = _mm256_mul_epu32(C0, D0); \
|
||||
ml1 = _mm256_mul_epu32(C1, D1); \
|
||||
ml0 = _mm256_add_epi64(ml0, ml0); \
|
||||
ml1 = _mm256_add_epi64(ml1, ml1); \
|
||||
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
|
||||
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
|
||||
B0 = _mm256_xor_si256(B0, C0); \
|
||||
B1 = _mm256_xor_si256(B1, C1); \
|
||||
B0 = rotr63(B0); \
|
||||
B1 = rotr63(B1); \
|
||||
} while((void)0, 0);
|
||||
|
||||
@@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
\
|
||||
tmp1 = C0; \
|
||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
C1 = tmp1; \
|
||||
\
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C1 = tmp1; \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
} while(0);
|
||||
|
||||
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
\
|
||||
tmp1 = C0; \
|
||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
C1 = tmp1; \
|
||||
\
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C1 = tmp1; \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
} while((void)0, 0);
|
||||
|
||||
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
@@ -430,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
} while((void)0, 0)
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256r14_4way_context ctx;
|
||||
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
|
||||
blake256r14_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r14_4way_init( &blake_4w_ctx );
|
||||
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
|
||||
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
@@ -48,7 +48,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
@@ -107,7 +107,7 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
||||
if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 8;
|
||||
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
#include "blake-gate.h"
|
||||
|
||||
int64_t blake_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blake_get_max64;
|
||||
//#if defined (__AVX2__) && defined (FOUR_WAY)
|
||||
// gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
// gate->scanhash = (void*)&scanhash_blake_8way;
|
||||
// gate->hash = (void*)&blakehash_8way;
|
||||
#if defined(BLAKE_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->scanhash = (void*)&scanhash_blake_4way;
|
||||
|
||||
@@ -37,8 +37,6 @@
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
//#ifdef __SSE4_2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -51,49 +49,45 @@ extern "C"{
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
// With SSE4.2 only Blake-256 4 way is available.
|
||||
// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
|
||||
|
||||
// Blake-256 4 way
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default 14 rounds
|
||||
// Default, 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
// Blake-256 8 way
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 8 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
@@ -102,39 +96,92 @@ typedef struct {
|
||||
// Default 14 rounds
|
||||
typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
void blake256r14_8way_init(void *cc);
|
||||
void blake256r14_8way(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_8way_small_context blake256r8_8way_context;
|
||||
void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_4way_big_context;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init(void *cc);
|
||||
void blake512_4way(void *cc, const void *data, size_t len);
|
||||
void blake512_4way_close(void *cc, void *dst);
|
||||
void blake512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 16 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_16way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_16way_small_context blake256r8_16way_context;
|
||||
void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
@@ -321,7 +322,8 @@ do { \
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
// Blake-256 4 way
|
||||
// Not used
|
||||
#if 0
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
@@ -342,6 +344,8 @@ do { \
|
||||
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
@@ -359,7 +363,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m128i S0, S1, S2, S3; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
@@ -371,10 +374,6 @@ do { \
|
||||
H5 = casti_m128i( state->H, 5 ); \
|
||||
H6 = casti_m128i( state->H, 6 ); \
|
||||
H7 = casti_m128i( state->H, 7 ); \
|
||||
S0 = casti_m128i( state->S, 0 ); \
|
||||
S1 = casti_m128i( state->S, 1 ); \
|
||||
S2 = casti_m128i( state->S, 2 ); \
|
||||
S3 = casti_m128i( state->S, 3 ); \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -388,17 +387,13 @@ do { \
|
||||
casti_m128i( state->H, 5 ) = H5; \
|
||||
casti_m128i( state->H, 6 ) = H6; \
|
||||
casti_m128i( state->H, 7 ) = H7; \
|
||||
casti_m128i( state->S, 0 ) = S0; \
|
||||
casti_m128i( state->S, 1 ) = S1; \
|
||||
casti_m128i( state->S, 2 ) = S2; \
|
||||
casti_m128i( state->S, 3 ) = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
// not used
|
||||
|
||||
#if 0
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
@@ -441,6 +436,7 @@ do { \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@@ -508,14 +504,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
|
||||
V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m128_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m128_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
m128_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
m128_const1_64( 0x299F31D0299F31D0 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
|
||||
m128_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
|
||||
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||
BLAKE256_4WAY_BLOCK_BSWAP32; \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
@@ -534,14 +534,14 @@ do { \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = mm128_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm128_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm128_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm128_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm128_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm128_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm128_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm128_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
@@ -552,13 +552,13 @@ do { \
|
||||
|
||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
@@ -577,7 +577,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u32 T0, T1;
|
||||
|
||||
#define READ_STATE32_8WAY(state) \
|
||||
@@ -590,10 +589,6 @@ do { \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -608,10 +603,6 @@ do { \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -631,16 +622,20 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
|
||||
shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m256_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
m256_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
m256_const1_64( 0x299F31D0299F31D0 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
||||
m256_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
||||
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
||||
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
||||
@@ -674,17 +669,155 @@ do { \
|
||||
ROUND_S_8WAY(2); \
|
||||
ROUND_S_8WAY(3); \
|
||||
} \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blaske-256 16 way AVX512
|
||||
|
||||
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||
_mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||
_mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S_16WAY(r) do { \
|
||||
GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE32_16WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
|
||||
#define READ_STATE32_16WAY(state) \
|
||||
do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_16WAY(state) \
|
||||
do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#define COMPRESS32_16WAY( rounds ) \
|
||||
do { \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m512i shuf_bswap32; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m512_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||
m512_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||
m512_const1_64( 0x299F31D0299F31D0 ) ); \
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||
m512_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
|
||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
|
||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
|
||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
|
||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
|
||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
|
||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
|
||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
|
||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
|
||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
|
||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
|
||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
|
||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
|
||||
ROUND_S_16WAY(0); \
|
||||
ROUND_S_16WAY(1); \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
ROUND_S_16WAY(4); \
|
||||
ROUND_S_16WAY(5); \
|
||||
ROUND_S_16WAY(6); \
|
||||
ROUND_S_16WAY(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_S_16WAY(8); \
|
||||
ROUND_S_16WAY(9); \
|
||||
ROUND_S_16WAY(0); \
|
||||
ROUND_S_16WAY(1); \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
} \
|
||||
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
|
||||
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
|
||||
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
|
||||
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
|
||||
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
|
||||
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
|
||||
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
|
||||
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
@@ -695,27 +828,22 @@ static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
__m128i zero = m128_zero;
|
||||
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
|
||||
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
|
||||
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
|
||||
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
|
||||
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
|
||||
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
|
||||
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
|
||||
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = zero;
|
||||
casti_m128i( ctx->S, 1 ) = zero;
|
||||
casti_m128i( ctx->S, 2 ) = zero;
|
||||
casti_m128i( ctx->S, 3 ) = zero;
|
||||
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
|
||||
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m128i *buf = (__m128i*)ctx->buf;
|
||||
size_t bptr = ctx->ptr<<2;
|
||||
@@ -778,12 +906,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
else
|
||||
ctx->T0 -= 512 - bit_len;
|
||||
|
||||
buf[vptr] = _mm_set1_epi32( 0x80 );
|
||||
buf[vptr] = m128_const1_64( 0x0000008000000080 );
|
||||
|
||||
if ( vptr < 12 )
|
||||
{
|
||||
memset_zero_128( buf + vptr + 1, 13 - vptr );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
||||
m128_const1_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
@@ -795,7 +924,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_128( buf, 56>>2 );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
|
||||
m128_const1_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
|
||||
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( ctx, buf, 64 );
|
||||
@@ -814,21 +944,14 @@ static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
@@ -887,7 +1010,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
|
||||
buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -909,7 +1032,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm256_or_si256( buf[52>>2],
|
||||
_mm256_set1_epi32( 0x01000000UL ) );
|
||||
m256_const1_64( 0x0100000001000000ULL ) );
|
||||
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
|
||||
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
|
||||
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
@@ -922,7 +1045,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
memset_zero_256( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
|
||||
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
|
||||
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
|
||||
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
|
||||
blake32_8way( sc, buf, 64 );
|
||||
@@ -932,6 +1055,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//Blake-256 16 way AVX512
|
||||
|
||||
static void
|
||||
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // number of elements, sizeof/4
|
||||
DECL_STATE32_16WAY
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
READ_STATE32_16WAY(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>2);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_16WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32_16WAY(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 52 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm512_or_si512( buf[52>>2],
|
||||
m512_const1_64( 0x0100000001000000ULL ) );
|
||||
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_512( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
|
||||
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||
blake32_16way( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r14_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r8_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
// default 14 rounds, backward copatibility
|
||||
@@ -942,7 +1238,7 @@ blake256_4way_init(void *ctx)
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way(void *ctx, const void *data, size_t len)
|
||||
blake256_4way_update(void *ctx, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(ctx, data, len);
|
||||
}
|
||||
@@ -964,7 +1260,7 @@ blake256_8way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake256_8way(void *cc, const void *data, size_t len)
|
||||
blake256_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_8way(cc, data, len);
|
||||
}
|
||||
@@ -984,7 +1280,7 @@ void blake256r14_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way(void *cc, const void *data, size_t len)
|
||||
blake256r14_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
}
|
||||
@@ -1003,7 +1299,7 @@ void blake256r14_8way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_8way(void *cc, const void *data, size_t len)
|
||||
blake256r14_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_8way(cc, data, len);
|
||||
}
|
||||
@@ -1023,7 +1319,7 @@ void blake256r8_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way(void *cc, const void *data, size_t len)
|
||||
blake256r8_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
}
|
||||
@@ -1042,7 +1338,7 @@ void blake256r8_8way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_8way(void *cc, const void *data, size_t len)
|
||||
blake256r8_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_8way(cc, data, len);
|
||||
}
|
||||
|
||||
@@ -1,322 +0,0 @@
|
||||
// convert blake256 32 bit to use 64 bit with serial vectoring
|
||||
//
|
||||
// cut calls to GS in half
|
||||
//
|
||||
// combine V
|
||||
// v0 = {V0,V1}
|
||||
// v1 = {V2,V3}
|
||||
// v2 = {V4,V5}
|
||||
// v3 = {V6,V7}
|
||||
// v4 = {V8,V9}
|
||||
// v5 = {VA,VB}
|
||||
// v6 = {VC,VD}
|
||||
// v7 = {CE,VF}
|
||||
//
|
||||
// v6x = {VD,VC} swap(VC,VD) swap(v6)
|
||||
// v7x = {VF,VE} swap(VE,VF) swap(v7)
|
||||
//
|
||||
// V0 = v1v0
|
||||
// V1 = v3v2
|
||||
// V2 = v5v4
|
||||
// V3 = v7v6
|
||||
// V4 = v9v8
|
||||
// V5 = vbva
|
||||
// V6 = vdvc
|
||||
// V7 = vfve
|
||||
//
|
||||
// The rotate in ROUND is to effect straddle and unstraddle for the third
|
||||
// and 4th iteration of GS.
|
||||
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
|
||||
// 256 bits. After the transform they must be restored with only the
|
||||
// chosen bits modified in the original 2 vectors.
|
||||
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
|
||||
// 256 bit vector and saves the untouched bits temporailly in arg0, the
|
||||
// "high" 256 bit vector. Simply reverse the process to restore data back
|
||||
// to original positions.
|
||||
|
||||
// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
|
||||
//
|
||||
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
|
||||
// on top of parallel vectoring. Same data in the same place just taking
|
||||
// two chunks at a time.
|
||||
//
|
||||
// Transparent to user, x2 mode used when AVX2 detected.
|
||||
// Use existing 4way context but revert to scalar types.
|
||||
// Same interleave function (128 bit) or x2 with 256 bit?
|
||||
// User trsnaparency would have to apply to interleave as well.
|
||||
//
|
||||
// Use common 4way update and close
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blakex2_4way_small_context __attribute__ ((aligned (64)));
|
||||
*/
|
||||
|
||||
static void
|
||||
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
|
||||
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
|
||||
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
|
||||
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
|
||||
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
|
||||
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
|
||||
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
|
||||
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 1 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 2 ) = m128_zero;
|
||||
casti_m128i( ctx->S, 3 ) = m128_zero;
|
||||
/*
|
||||
sc->S[0] = _mm_set1_epi32( salt[0] );
|
||||
sc->S[1] = _mm_set1_epi32( salt[1] );
|
||||
sc->S[2] = _mm_set1_epi32( salt[2] );
|
||||
sc->S[3] = _mm_set1_epi32( salt[3] );
|
||||
*/
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
|
||||
{
|
||||
__m128i *buf = (__m256i*)ctx->buf;
|
||||
size_t bptr = ctx->ptr << 2;
|
||||
size_t vptr = ctx->ptr >> 3;
|
||||
size_t blen = len << 2;
|
||||
// unsigned char *buf = ctx->buf;
|
||||
// size_t ptr = ctx->ptr<<4; // repurposed
|
||||
DECL_STATE32x2
|
||||
|
||||
// buf = sc->buf;
|
||||
// ptr = sc->ptr;
|
||||
|
||||
// adjust len for use with ptr, clen, all absolute bytes.
|
||||
// int blen = len<<2;
|
||||
|
||||
if ( blen < (sizeof ctx->buf) - bptr )
|
||||
{
|
||||
memcpy( buf + vptr, data, blen );
|
||||
ptr += blen;
|
||||
ctx->ptr = bptr >> 2;;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32( ctx );
|
||||
while ( blen > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = ( sizeof sc->buf ) - ptr;
|
||||
if ( clen > blen )
|
||||
clen = blen;
|
||||
memcpy( buf + vptr, data, clen );
|
||||
bptr += clen;
|
||||
vptr = bptr >> 5;
|
||||
data = (const unsigned char *)data + clen;
|
||||
blen -= clen;
|
||||
if ( bptr == sizeof ctx->buf )
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
|
||||
T1 += 1;
|
||||
COMPRESS32x2_4WAY( ctx->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32x2( ctx );
|
||||
ctx->ptr = bptr >> 2;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
|
||||
{
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
size_t ptr = ctx->ptr;
|
||||
size_t vptr = ctx->ptr>>2;
|
||||
unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane
|
||||
uint32_t th = ctx->T1;
|
||||
uint32_t tl = ctx->T0 + bit_len;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( ctx->T0 == 0 )
|
||||
{
|
||||
ctx->T0 = 0xFFFFFE00UL + bit_len;
|
||||
ctx->T1 -= 1;
|
||||
}
|
||||
else
|
||||
ctx->T0 -= 512 - bit_len;
|
||||
|
||||
// memset doesn't do ints
|
||||
buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
|
||||
|
||||
if ( vptr < 5 )
|
||||
{
|
||||
memset_zero_256( buf + vptr + 1, 6 - vptr );
|
||||
buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
|
||||
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) );
|
||||
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
|
||||
th,th,th,th ) );
|
||||
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( vbuf + vptr + 1, 7 - vptr );
|
||||
blake32x2_4way( ctx, vbuf + ptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
buf[ 6 ] = mm256_zero;
|
||||
buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
|
||||
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
|
||||
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
|
||||
th, th, th, th );
|
||||
blake32x2_4way( ctx, buf, 64 );
|
||||
}
|
||||
|
||||
casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
|
||||
casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
|
||||
casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
|
||||
casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#define DECL_STATE32x2_4WAY \
|
||||
__m256i H0, H1, H2, H3; \
|
||||
__m256i S0, S1; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32x2_4WAY(state) do \
|
||||
{ \
|
||||
H0 = casti_m256i( state->H, 0 ); \
|
||||
H1 = casti_m256i( state->H, 1 ); \
|
||||
H2 = casti_m256i( state->H, 2 ); \
|
||||
H3 = casti_m256i( state->H, 3 ); \
|
||||
S0 = casti_m256i( state->S, 0 ); \
|
||||
S1 = casti_m256i( state->S, 1 ); \
|
||||
T0 = state->T0; \
|
||||
T1 = state->T1; \
|
||||
|
||||
#define WRITE_STATE32x2_4WAY(state) do { \
|
||||
casti_m256i( state->H, 0 ) = H0; \
|
||||
casti_m256i( state->H, 1 ) = H1; \
|
||||
casti_m256i( state->H, 2 ) = H2; \
|
||||
casti_m256i( state->H, 3 ) = H3; \
|
||||
casti_m256i( state->S, 0 ) = S0; \
|
||||
casti_m256i( state->S, 1 ) = S1; \
|
||||
state->T0 = T0; \
|
||||
state->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
|
||||
_mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
|
||||
d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
|
||||
_mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_Sx2_4WAY(r) do \
|
||||
{ \
|
||||
GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
|
||||
CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
|
||||
GS2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
|
||||
CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
|
||||
mm256_ror1x128_512( V3, V2 ); \
|
||||
mm256_ror1x128_512( V6, V7 ); \
|
||||
GS2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
|
||||
CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
|
||||
GS2_4WAY( Mx(r, C), Mx(r, D), Mx(r, C), Mx(r, D), \
|
||||
CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
|
||||
mm256_rol1x128_512( V2, V3 ); \
|
||||
mm256_rol1x128_512( V7, V6 );
|
||||
|
||||
#define COMPRESS32x2_4WAY( rounds ) do \
|
||||
{ \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
|
||||
CS0, CS0, CS0, CS0 ) ); \
|
||||
V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
|
||||
CS2, CS2, CS2, CS2 ) ); \
|
||||
V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
|
||||
_mm256_set_epi32( CS5, CS5, CS5, CS5, \
|
||||
CS4, CS4, CS4, CS4 ) ); \
|
||||
V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
||||
_mm256_set_epi32( CS7, CS7, CS7, CS7, \
|
||||
CS6, CS6, CS6, CS6 ) ); \
|
||||
M0 = mm256_bswap_32( buf[ 0] ); \
|
||||
M1 = mm256_bswap_32( buf[ 1] ); \
|
||||
M2 = mm256_bswap_32( buf[ 2] ); \
|
||||
M3 = mm256_bswap_32( buf[ 3] ); \
|
||||
M4 = mm256_bswap_32( buf[ 4] ); \
|
||||
M5 = mm256_bswap_32( buf[ 5] ); \
|
||||
M6 = mm256_bswap_32( buf[ 6] ); \
|
||||
M7 = mm256_bswap_32( buf[ 7] ); \
|
||||
ROUND_Sx2_4WAY(0); \
|
||||
ROUND_Sx2_4WAY(1); \
|
||||
ROUND_Sx2_4WAY(2); \
|
||||
ROUND_Sx2_4WAY(3); \
|
||||
ROUND_Sx2_4WAY(4); \
|
||||
ROUND_Sx2_4WAY(5); \
|
||||
ROUND_Sx2_4WAY(6); \
|
||||
ROUND_Sx2_4WAY(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_Sx2_4WAY(8); \
|
||||
ROUND_Sx2_4WAY(9); \
|
||||
ROUND_Sx2_4WAY(0); \
|
||||
ROUND_Sx2_4WAY(1); \
|
||||
ROUND_Sx2_4WAY(2); \
|
||||
ROUND_Sx2_4WAY(3); \
|
||||
} \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
113
algo/blake/blake2b-4way.c
Normal file
113
algo/blake/blake2b-4way.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
20
algo/blake/blake2b-gate.c
Normal file
20
algo/blake/blake2b-gate.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
// gate->hash = (void*)&blake2b_8way_hash;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
34
algo/blake/blake2b-gate.h
Normal file
34
algo/blake/blake2b-gate.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#ifndef __BLAKE2B_GATE_H__
|
||||
#define __BLAKE2B_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2b_hash( void *state, const void *input );
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
374
algo/blake/blake2b-hash-4way.c
Normal file
374
algo/blake/blake2b-hash-4way.c
Normal file
@@ -0,0 +1,374 @@
|
||||
/*
|
||||
* Copyright 2009 Colin Percival, 2014 savale
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* This file was originally written by Colin Percival as part of the Tarsnap
|
||||
* online backup system.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const uint8_t sigma[12][16] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define B2B8W_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
{
|
||||
__m512i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
v[ 1] = ctx->h[1];
|
||||
v[ 2] = ctx->h[2];
|
||||
v[ 3] = ctx->h[3];
|
||||
v[ 4] = ctx->h[4];
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
|
||||
m[ 0] = ctx->b[ 0];
|
||||
m[ 1] = ctx->b[ 1];
|
||||
m[ 2] = ctx->b[ 2];
|
||||
m[ 3] = ctx->b[ 3];
|
||||
m[ 4] = ctx->b[ 4];
|
||||
m[ 5] = ctx->b[ 5];
|
||||
m[ 6] = ctx->b[ 6];
|
||||
m[ 7] = ctx->b[ 7];
|
||||
m[ 8] = ctx->b[ 8];
|
||||
m[ 9] = ctx->b[ 9];
|
||||
m[10] = ctx->b[10];
|
||||
m[11] = ctx->b[11];
|
||||
m[12] = ctx->b[12];
|
||||
m[13] = ctx->b[13];
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B8W_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B8W_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
|
||||
B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
|
||||
B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
|
||||
B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
|
||||
B2B8W_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
ctx->c = 0;
|
||||
ctx->outlen = 32;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
ctx->b[i] = m512_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m512i* in =(__m512i*)input;
|
||||
|
||||
size_t i, c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
for ( i = 0; i < (inlen >> 3); i++ )
|
||||
{
|
||||
if ( ctx->c == 128 )
|
||||
{
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_8way_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
ctx->c += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
|
||||
while ( ctx->c < 128 )
|
||||
{
|
||||
ctx->b[c++] = m512_zero;
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m512i( out, 0 ) = ctx->h[0];
|
||||
casti_m512i( out, 1 ) = ctx->h[1];
|
||||
casti_m512i( out, 2 ) = ctx->h[2];
|
||||
casti_m512i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
|
||||
// G Mixing function.
|
||||
|
||||
#define B2B_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
|
||||
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
|
||||
v[c] = _mm256_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
// Initialization Vector.
|
||||
/*
|
||||
static const uint64_t blake2b_iv[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
*/
|
||||
|
||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
{
|
||||
__m256i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
v[ 1] = ctx->h[1];
|
||||
v[ 2] = ctx->h[2];
|
||||
v[ 3] = ctx->h[3];
|
||||
v[ 4] = ctx->h[4];
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm256_not( v[14] );
|
||||
|
||||
m[ 0] = ctx->b[ 0];
|
||||
m[ 1] = ctx->b[ 1];
|
||||
m[ 2] = ctx->b[ 2];
|
||||
m[ 3] = ctx->b[ 3];
|
||||
m[ 4] = ctx->b[ 4];
|
||||
m[ 5] = ctx->b[ 5];
|
||||
m[ 6] = ctx->b[ 6];
|
||||
m[ 7] = ctx->b[ 7];
|
||||
m[ 8] = ctx->b[ 8];
|
||||
m[ 9] = ctx->b[ 9];
|
||||
m[10] = ctx->b[10];
|
||||
m[11] = ctx->b[11];
|
||||
m[12] = ctx->b[12];
|
||||
m[13] = ctx->b[13];
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
|
||||
B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
|
||||
B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
|
||||
B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
|
||||
B2B_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
|
||||
B2B_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
ctx->c = 0;
|
||||
ctx->outlen = 32;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
ctx->b[i] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m256i* in =(__m256i*)input;
|
||||
|
||||
size_t i, c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
for ( i = 0; i < (inlen >> 3); i++ )
|
||||
{
|
||||
if ( ctx->c == 128 )
|
||||
{
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_4way_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
ctx->c += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
|
||||
while ( ctx->c < 128 )
|
||||
{
|
||||
ctx->b[c++] = m256_zero;
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m256i( out, 0 ) = ctx->h[0];
|
||||
casti_m256i( out, 1 ) = ctx->h[1];
|
||||
casti_m256i( out, 2 ) = ctx->h[2];
|
||||
casti_m256i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif // AVX2
|
||||
53
algo/blake/blake2b-hash-4way.h
Normal file
53
algo/blake/blake2b-hash-4way.h
Normal file
@@ -0,0 +1,53 @@
|
||||
#pragma once
|
||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
||||
#define __BLAKE2B_HASH_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <inttypes.h>
|
||||
#define inline __inline
|
||||
#define ALIGN(x) __declspec(align(x))
|
||||
#else
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN(128) typedef struct {
|
||||
__m512i b[16]; // input buffer
|
||||
__m512i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_8way_ctx;
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(128) typedef struct {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_4way_ctx;
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -3,13 +3,14 @@
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
|
||||
//static __thread sph_blake2b_ctx s_midstate;
|
||||
//static __thread sph_blake2b_ctx s_ctx;
|
||||
#define MIDLEN 76
|
||||
#define A 64
|
||||
|
||||
@@ -25,16 +26,6 @@ void blake2b_hash(void *output, const void *input)
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
/*
|
||||
static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
|
||||
{
|
||||
s_ctx.outlen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
sph_blake2b_final(&s_ctx, (uint8_t*) output);
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -45,7 +36,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[8];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
@@ -53,179 +44,21 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
// midstate (untested yet)
|
||||
//blake2b_init(&s_midstate, 32, NULL, 0);
|
||||
//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
|
||||
//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[8], n);
|
||||
//blake2b_hash_end(vhashcpu, endiandata);
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
|
||||
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
|
||||
work_set_target_ratio(work, vhashcpu);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[8] = n;
|
||||
return 1;
|
||||
}
|
||||
n++;
|
||||
|
||||
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[8] = n;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void swab256(void *dest_p, const void *src_p)
|
||||
{
|
||||
uint32_t *dest = (uint32_t *)dest_p;
|
||||
const uint32_t *src = (uint32_t *)src_p;
|
||||
|
||||
dest[0] = swab32(src[7]);
|
||||
dest[1] = swab32(src[6]);
|
||||
dest[2] = swab32(src[5]);
|
||||
dest[3] = swab32(src[4]);
|
||||
dest[4] = swab32(src[3]);
|
||||
dest[5] = swab32(src[2]);
|
||||
dest[6] = swab32(src[1]);
|
||||
dest[7] = swab32(src[0]);
|
||||
}
|
||||
|
||||
/* compute nbits to get the network diff */
|
||||
void blake2b_calc_network_diff(struct work *work)
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
uint32_t nbits = work->data[11]; // unsure if correct
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
for (int m=shift; m < 29; m++) d *= 256.0;
|
||||
for (int m=29; m < shift; m++) d /= 256.0;
|
||||
if (opt_debug_diff)
|
||||
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
|
||||
net_diff = d;
|
||||
}
|
||||
|
||||
void blake2b_be_build_stratum_request( char *req, struct work *work )
|
||||
{
|
||||
unsigned char *xnonce2str;
|
||||
uint32_t ntime, nonce;
|
||||
char ntimestr[9], noncestr[9];
|
||||
be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
|
||||
be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
|
||||
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
|
||||
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
|
||||
uint16_t high_nonce = swab32(work->data[9]) >> 16;
|
||||
xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
|
||||
snprintf( req, JSON_BUF_LEN,
|
||||
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
|
||||
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
|
||||
free( xnonce2str );
|
||||
}
|
||||
|
||||
#define min(a,b) (a>b ? (b) :(a))
|
||||
|
||||
// merkle root handled here, no need for gen_merkle_root gate target
|
||||
void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
{
|
||||
uchar merkle_root[64] = { 0 };
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
size_t t;
|
||||
int i;
|
||||
|
||||
// merkle root
|
||||
memcpy( merkle_root, sctx->job.coinbase, 32 );
|
||||
headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
// g_work->data[0] = le32dec( sctx->job.version );
|
||||
// for ( i = 0; i < 8; i++ )
|
||||
// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
|
||||
// for ( i = 0; i < 8; i++ )
|
||||
// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
|
||||
g_work->data[8] = 0; // nonce
|
||||
g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 );
|
||||
g_work->data[10] = be32dec( sctx->job.ntime );
|
||||
g_work->data[11] = be32dec( sctx->job.nbits );
|
||||
for ( i = 0; i < 8; i++ )
|
||||
g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
|
||||
}
|
||||
|
||||
#undef min
|
||||
|
||||
void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr, bool clean_job )
|
||||
{
|
||||
const int wkcmp_sz = 32; // bytes
|
||||
const int wkcmp_off = 32 + 16;
|
||||
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
|
||||
|
||||
if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
|
||||
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|
||||
|| strcmp( work->job_id, g_work->job_id ) ) )
|
||||
{
|
||||
work_free( work );
|
||||
work_copy( work, g_work );
|
||||
*nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
|
||||
if ( opt_randomize )
|
||||
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
|
||||
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
|
||||
}
|
||||
else
|
||||
++(*nonceptr);
|
||||
|
||||
// suprnova job_id check without data/target/height change...
|
||||
// we just may have copied new g_wwork to work so why this test here?
|
||||
// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) )
|
||||
// exit thread loop
|
||||
// continue;
|
||||
// else
|
||||
// {
|
||||
// nonceptr[1] += 0x10;
|
||||
// nonceptr[1] |= thr_id;
|
||||
// }
|
||||
}
|
||||
|
||||
bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
|
||||
int thr_id )
|
||||
{
|
||||
if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
|
||||
// need to regen g_work..
|
||||
return false;
|
||||
// extradata: prevent duplicates
|
||||
work->data[ 8 ] += 0x10;
|
||||
work->data[ 8 + 1 ] |= thr_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
double blake2b_get_max64() { return 0x1fffffLL; }
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
algo_not_tested();
|
||||
gate->ntime_index = 10;
|
||||
gate->nbits_index = 11;
|
||||
gate->nonce_index = 8;
|
||||
gate->work_cmp_size = 32;
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
gate->calc_network_diff = (void*)&blake2b_calc_network_diff;
|
||||
gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->build_extraheader = (void*)&blake2b_build_extraheader;
|
||||
gate->get_new_work = (void*)&blake2b_get_new_work;
|
||||
gate->get_max64 = (void*)&blake2b_get_max64;
|
||||
gate->ready_to_mine = (void*)&blake2b_ready_to_mine;
|
||||
have_gbt = false;
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3,22 +3,72 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_8x32( output, output+ 32, output+ 64, output+ 96,
|
||||
output+128, output+160, output+192, output+224,
|
||||
vhash, 256 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_4x32( output, output+32, output+64, output+96,
|
||||
vhash, 256 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
@@ -19,8 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -4,16 +4,30 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
@@ -17,13 +17,16 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
/*
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
{
|
||||
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
|
||||
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
|
||||
};
|
||||
*/
|
||||
|
||||
static const uint8_t blake2s_sigma[10][16] =
|
||||
{
|
||||
@@ -39,6 +42,7 @@ static const uint8_t blake2s_sigma[10][16] =
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
|
||||
};
|
||||
|
||||
|
||||
// define a constant for initial param.
|
||||
|
||||
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
@@ -57,8 +61,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
for( int i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
@@ -76,41 +90,45 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm_set1_epi32( blake2s_IV[3] );
|
||||
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[4] ) );
|
||||
m128_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[5] ) );
|
||||
m128_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[6] ) );
|
||||
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[7] ) );
|
||||
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W(r,i,a,b,c,d) \
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define ROUND4W(r) \
|
||||
do { \
|
||||
G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND4W( 0 );
|
||||
@@ -132,26 +150,47 @@ do { \
|
||||
return 0;
|
||||
}
|
||||
|
||||
// There is a problem that can't be resolved internally.
|
||||
// If the last block is a full 64 bytes it should not be compressed in
|
||||
// update but left for final. However, when streaming, it isn't known
|
||||
// which block is last. There may be a subsequent call to update to add
|
||||
// more data.
|
||||
//
|
||||
// The reference code handled this by juggling 2 blocks at a time at
|
||||
// a significant performance penalty.
|
||||
//
|
||||
// Instead a new function is introduced called full_blocks which combines
|
||||
// update and final and is to be used in non-streaming mode where the data
|
||||
// is a multiple of 64 bytes.
|
||||
//
|
||||
// Supported:
|
||||
// 64 + 16 bytes (blake2s with midstate optimization)
|
||||
// 80 bytes (blake2s without midstate optimization)
|
||||
// Any multiple of 64 bytes in one shot (x25x)
|
||||
//
|
||||
// Unsupported:
|
||||
// Stream of full 64 byte blocks one at a time.
|
||||
|
||||
// use only when streaming more data or final block not full.
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
if( inlen >= BLAKE2S_BLOCKBYTES - left )
|
||||
{
|
||||
memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
|
||||
S->buflen += BLAKE2S_BLOCKBYTES - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
input += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -183,8 +222,45 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Update and final when inlen is a multiple of 64 bytes
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m128i *in = (__m128i*)input;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
{
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
blake2s_4way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m128i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// The commented code below is slower on Intel but faster on
|
||||
// Zen1 AVX2. It's also faster than Zen1 AVX.
|
||||
// Ryzen gen2 is unknown at this time.
|
||||
|
||||
int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
{
|
||||
__m256i m[16];
|
||||
@@ -193,6 +269,23 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
m256_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
m256_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
@@ -206,6 +299,7 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
@@ -219,7 +313,36 @@ do { \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define G8W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G8W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G8W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G8W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G8W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
@@ -231,6 +354,7 @@ do { \
|
||||
G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
ROUND8W( 0 );
|
||||
ROUND8W( 1 );
|
||||
@@ -267,8 +391,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
for( int i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
@@ -329,9 +463,203 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Update and final when inlen is a multiple of 64 bytes
|
||||
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m256i *in = (__m256i*)input;
|
||||
__m256i *buf = (__m256i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
{
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_8way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
blake2s_8way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m256i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake2s-256 16 way
|
||||
|
||||
int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
{
|
||||
__m512i m[16];
|
||||
__m512i v[16];
|
||||
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
m512_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
m512_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND16W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND16W( 0 );
|
||||
ROUND16W( 1 );
|
||||
ROUND16W( 2 );
|
||||
ROUND16W( 3 );
|
||||
ROUND16W( 4 );
|
||||
ROUND16W( 5 );
|
||||
ROUND16W( 6 );
|
||||
ROUND16W( 7 );
|
||||
ROUND16W( 8 );
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
{
|
||||
blake2s_nway_param P[1];
|
||||
|
||||
P->digest_length = outlen;
|
||||
P->key_length = 0;
|
||||
P->fanout = 1;
|
||||
P->depth = 1;
|
||||
P->leaf_length = 0;
|
||||
*((uint64_t*)(P->node_offset)) = 0;
|
||||
P->node_depth = 0;
|
||||
P->inner_length = 0;
|
||||
memset( P->salt, 0, sizeof( P->salt ) );
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m512i *input = (__m512i*)in;
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
{
|
||||
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_16way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
|
||||
S->buflen += (size_t) inlen;
|
||||
input += ( inlen>>2 );
|
||||
inlen -= inlen;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
|
||||
{
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node )
|
||||
S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_512( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
blake2s_16way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m512i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#if 0
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#ifndef __BLAKE2S_HASH_4WAY_H__
|
||||
#define __BLAKE2S_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
@@ -74,6 +74,9 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -91,6 +94,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
|
||||
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN( 128 ) typedef struct __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
uint8_t last_node;
|
||||
} blake2s_16way_state ;
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -107,6 +131,6 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __SSE4_2__
|
||||
#endif // __SSE2__
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -56,7 +58,7 @@ int scanhash_blake2s( struct work *work,
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
@@ -70,18 +72,4 @@ int scanhash_blake2s( struct work *work,
|
||||
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
|
||||
blake256r8_4way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
|
||||
blake256r8_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r8_4way_init( &blakecoin_4w_ctx );
|
||||
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
|
||||
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
@@ -49,7 +49,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
|
||||
blake256r8_8way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
|
||||
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
@@ -108,7 +108,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
@@ -1,13 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
// return 0x3fffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
@@ -23,7 +16,6 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
|
||||
@@ -93,33 +96,4 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
|
||||
}
|
||||
*/
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
blakecoin_init( &blake_init_ctx );
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_update( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
@@ -62,7 +62,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
@@ -38,7 +38,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if ( net_diff > 0. )
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -78,7 +78,6 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
uint32_t extraheader[32] = { 0 };
|
||||
int headersize = 0;
|
||||
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
|
||||
size_t t;
|
||||
int i;
|
||||
|
||||
// getwork over stratum, getwork merkle + header passed in coinb1
|
||||
@@ -87,9 +86,6 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
sizeof(extraheader) );
|
||||
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
|
||||
|
||||
// Increment extranonce2
|
||||
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
|
||||
|
||||
// Assemble block header
|
||||
memset( g_work->data, 0, sizeof(g_work->data) );
|
||||
g_work->data[0] = le32dec( sctx->job.version );
|
||||
@@ -116,7 +112,7 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
// block header suffix from coinb2 (stake version)
|
||||
memcpy( &g_work->data[44],
|
||||
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
|
||||
sctx->bloc_height = g_work->data[32];
|
||||
sctx->block_height = g_work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
//applog_hex(&work->data[36], 36);
|
||||
}
|
||||
@@ -153,8 +149,7 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&decred_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
#include "decred-gate.h"
|
||||
|
||||
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
|
||||
|
||||
#include "sph_blake.h"
|
||||
|
||||
#include <string.h>
|
||||
@@ -77,25 +80,15 @@ int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
#endif
|
||||
|
||||
do {
|
||||
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
|
||||
endiandata[DECRED_NONCE_INDEX] = n;
|
||||
decred_hash(hash32, endiandata);
|
||||
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
|
||||
applog_hash(ptarget);
|
||||
applog_compare_hash(hash32, ptarget);
|
||||
#endif
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
return 1;
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
|
||||
n++;
|
||||
@@ -143,7 +136,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if (net_diff > 0.)
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -269,7 +262,6 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
@@ -286,3 +278,5 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )
|
||||
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, input, 80 );
|
||||
blake512_4way_update( &ctx, input, 80 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
@@ -105,7 +105,7 @@ int scanhash_pentablake_4way( struct work *work,
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ bool register_pentablake_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&pentablakehash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
#include "pentablake-gate.h"
|
||||
|
||||
#if !defined(PENTABLAKE_8WAY) && !defined(PENTABLAKE_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -111,3 +114,4 @@ int scanhash_pentablake( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
|
||||
v[13] ^= ctx->t[1]; // high 64 bits
|
||||
if (last) // last block flag set ?
|
||||
v[14] = ~v[14];
|
||||
|
||||
for (i = 0; i < 16; i++) // get little-endian words
|
||||
m[i] = B2B_GET64(&ctx->b[8 * i]);
|
||||
|
||||
@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )
|
||||
|
||||
while (ctx->c < 128) // fill up with zeros
|
||||
ctx->b[ctx->c++] = 0;
|
||||
blake2b_compress(ctx, 1); // final block flag = 1
|
||||
|
||||
blake2b_compress(ctx, 1); // final block flag = 1
|
||||
|
||||
// little endian convert and store
|
||||
for (i = 0; i < ctx->outlen; i++) {
|
||||
|
||||
@@ -1,476 +0,0 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "../sph_blake.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
static const sph_u64 blkIV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
#define Z02 2
|
||||
#define Z03 3
|
||||
#define Z04 4
|
||||
#define Z05 5
|
||||
#define Z06 6
|
||||
#define Z07 7
|
||||
#define Z08 8
|
||||
#define Z09 9
|
||||
#define Z0A A
|
||||
#define Z0B B
|
||||
#define Z0C C
|
||||
#define Z0D D
|
||||
#define Z0E E
|
||||
#define Z0F F
|
||||
|
||||
#define Z10 E
|
||||
#define Z11 A
|
||||
#define Z12 4
|
||||
#define Z13 8
|
||||
#define Z14 9
|
||||
#define Z15 F
|
||||
#define Z16 D
|
||||
#define Z17 6
|
||||
#define Z18 1
|
||||
#define Z19 C
|
||||
#define Z1A 0
|
||||
#define Z1B 2
|
||||
#define Z1C B
|
||||
#define Z1D 7
|
||||
#define Z1E 5
|
||||
#define Z1F 3
|
||||
|
||||
#define Z20 B
|
||||
#define Z21 8
|
||||
#define Z22 C
|
||||
#define Z23 0
|
||||
#define Z24 5
|
||||
#define Z25 2
|
||||
#define Z26 F
|
||||
#define Z27 D
|
||||
#define Z28 A
|
||||
#define Z29 E
|
||||
#define Z2A 3
|
||||
#define Z2B 6
|
||||
#define Z2C 7
|
||||
#define Z2D 1
|
||||
#define Z2E 9
|
||||
#define Z2F 4
|
||||
|
||||
#define Z30 7
|
||||
#define Z31 9
|
||||
#define Z32 3
|
||||
#define Z33 1
|
||||
#define Z34 D
|
||||
#define Z35 C
|
||||
#define Z36 B
|
||||
#define Z37 E
|
||||
#define Z38 2
|
||||
#define Z39 6
|
||||
#define Z3A 5
|
||||
#define Z3B A
|
||||
#define Z3C 4
|
||||
#define Z3D 0
|
||||
#define Z3E F
|
||||
#define Z3F 8
|
||||
|
||||
#define Z40 9
|
||||
#define Z41 0
|
||||
#define Z42 5
|
||||
#define Z43 7
|
||||
#define Z44 2
|
||||
#define Z45 4
|
||||
#define Z46 A
|
||||
#define Z47 F
|
||||
#define Z48 E
|
||||
#define Z49 1
|
||||
#define Z4A B
|
||||
#define Z4B C
|
||||
#define Z4C 6
|
||||
#define Z4D 8
|
||||
#define Z4E 3
|
||||
#define Z4F D
|
||||
|
||||
#define Z50 2
|
||||
#define Z51 C
|
||||
#define Z52 6
|
||||
#define Z53 A
|
||||
#define Z54 0
|
||||
#define Z55 B
|
||||
#define Z56 8
|
||||
#define Z57 3
|
||||
#define Z58 4
|
||||
#define Z59 D
|
||||
#define Z5A 7
|
||||
#define Z5B 5
|
||||
#define Z5C F
|
||||
#define Z5D E
|
||||
#define Z5E 1
|
||||
#define Z5F 9
|
||||
|
||||
#define Z60 C
|
||||
#define Z61 5
|
||||
#define Z62 1
|
||||
#define Z63 F
|
||||
#define Z64 E
|
||||
#define Z65 D
|
||||
#define Z66 4
|
||||
#define Z67 A
|
||||
#define Z68 0
|
||||
#define Z69 7
|
||||
#define Z6A 6
|
||||
#define Z6B 3
|
||||
#define Z6C 9
|
||||
#define Z6D 2
|
||||
#define Z6E 8
|
||||
#define Z6F B
|
||||
|
||||
#define Z70 D
|
||||
#define Z71 B
|
||||
#define Z72 7
|
||||
#define Z73 E
|
||||
#define Z74 C
|
||||
#define Z75 1
|
||||
#define Z76 3
|
||||
#define Z77 9
|
||||
#define Z78 5
|
||||
#define Z79 0
|
||||
#define Z7A F
|
||||
#define Z7B 4
|
||||
#define Z7C 8
|
||||
#define Z7D 6
|
||||
#define Z7E 2
|
||||
#define Z7F A
|
||||
|
||||
#define Z80 6
|
||||
#define Z81 F
|
||||
#define Z82 E
|
||||
#define Z83 9
|
||||
#define Z84 B
|
||||
#define Z85 3
|
||||
#define Z86 0
|
||||
#define Z87 8
|
||||
#define Z88 C
|
||||
#define Z89 2
|
||||
#define Z8A D
|
||||
#define Z8B 7
|
||||
#define Z8C 1
|
||||
#define Z8D 4
|
||||
#define Z8E A
|
||||
#define Z8F 5
|
||||
|
||||
#define Z90 A
|
||||
#define Z91 2
|
||||
#define Z92 8
|
||||
#define Z93 4
|
||||
#define Z94 7
|
||||
#define Z95 6
|
||||
#define Z96 1
|
||||
#define Z97 5
|
||||
#define Z98 F
|
||||
#define Z99 B
|
||||
#define Z9A 9
|
||||
#define Z9B E
|
||||
#define Z9C 3
|
||||
#define Z9D C
|
||||
#define Z9E D
|
||||
#define Z9F 0
|
||||
|
||||
#define Mx(r, i) Mx_(Z ## r ## i)
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
#define CSx(r, i) CSx_(Z ## r ## i)
|
||||
#define CSx_(n) CSx__(n)
|
||||
#define CSx__(n) CS ## n
|
||||
|
||||
#define CS0 SPH_C32(0x243F6A88)
|
||||
#define CS1 SPH_C32(0x85A308D3)
|
||||
#define CS2 SPH_C32(0x13198A2E)
|
||||
#define CS3 SPH_C32(0x03707344)
|
||||
#define CS4 SPH_C32(0xA4093822)
|
||||
#define CS5 SPH_C32(0x299F31D0)
|
||||
#define CS6 SPH_C32(0x082EFA98)
|
||||
#define CS7 SPH_C32(0xEC4E6C89)
|
||||
#define CS8 SPH_C32(0x452821E6)
|
||||
#define CS9 SPH_C32(0x38D01377)
|
||||
#define CSA SPH_C32(0xBE5466CF)
|
||||
#define CSB SPH_C32(0x34E90C6C)
|
||||
#define CSC SPH_C32(0xC0AC29B7)
|
||||
#define CSD SPH_C32(0xC97C50DD)
|
||||
#define CSE SPH_C32(0x3F84D5B5)
|
||||
#define CSF SPH_C32(0xB5470917)
|
||||
|
||||
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
|
||||
#define GS(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = SPH_T32(a + b + (m0 ^ c1)); \
|
||||
d = SPH_ROTR32(d ^ a, 16); \
|
||||
c = SPH_T32(c + d); \
|
||||
b = SPH_ROTR32(b ^ c, 12); \
|
||||
a = SPH_T32(a + b + (m1 ^ c0)); \
|
||||
d = SPH_ROTR32(d ^ a, 8); \
|
||||
c = SPH_T32(c + d); \
|
||||
b = SPH_ROTR32(b ^ c, 7); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S(r) do { \
|
||||
GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
||||
#define GB(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = SPH_T64(a + b + (m0 ^ c1)); \
|
||||
d = SPH_ROTR64(d ^ a, 32); \
|
||||
c = SPH_T64(c + d); \
|
||||
b = SPH_ROTR64(b ^ c, 25); \
|
||||
a = SPH_T64(a + b + (m1 ^ c0)); \
|
||||
d = SPH_ROTR64(d ^ a, 16); \
|
||||
c = SPH_T64(c + d); \
|
||||
b = SPH_ROTR64(b ^ c, 11); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B(r) do { \
|
||||
GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define COMPRESS64 do { \
|
||||
int b=0; \
|
||||
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = blkH0, \
|
||||
V1 = blkH1, \
|
||||
V2 = blkH2, \
|
||||
V3 = blkH3, \
|
||||
V4 = blkH4, \
|
||||
V5 = blkH5, \
|
||||
V6 = blkH6, \
|
||||
V7 = blkH7; \
|
||||
V8 = blkS0 ^ CB0, \
|
||||
V9 = blkS1 ^ CB1, \
|
||||
VA = blkS2 ^ CB2, \
|
||||
VB = blkS3 ^ CB3, \
|
||||
VC = hashctA ^ CB4, \
|
||||
VD = hashctA ^ CB5, \
|
||||
VE = hashctB ^ CB6, \
|
||||
VF = hashctB ^ CB7; \
|
||||
M0 = sph_dec64be_aligned(buf + 0), \
|
||||
M1 = sph_dec64be_aligned(buf + 8), \
|
||||
M2 = sph_dec64be_aligned(buf + 16), \
|
||||
M3 = sph_dec64be_aligned(buf + 24), \
|
||||
M4 = sph_dec64be_aligned(buf + 32), \
|
||||
M5 = sph_dec64be_aligned(buf + 40), \
|
||||
M6 = sph_dec64be_aligned(buf + 48), \
|
||||
M7 = sph_dec64be_aligned(buf + 56), \
|
||||
M8 = sph_dec64be_aligned(buf + 64), \
|
||||
M9 = sph_dec64be_aligned(buf + 72), \
|
||||
MA = sph_dec64be_aligned(buf + 80), \
|
||||
MB = sph_dec64be_aligned(buf + 88), \
|
||||
MC = sph_dec64be_aligned(buf + 96), \
|
||||
MD = sph_dec64be_aligned(buf + 104), \
|
||||
ME = sph_dec64be_aligned(buf + 112), \
|
||||
MF = sph_dec64be_aligned(buf + 120); \
|
||||
/* loop once and a half */ \
|
||||
/* save some space */ \
|
||||
for (;;) { \
|
||||
ROUND_B(0); \
|
||||
ROUND_B(1); \
|
||||
ROUND_B(2); \
|
||||
ROUND_B(3); \
|
||||
ROUND_B(4); \
|
||||
ROUND_B(5); \
|
||||
if (b) break; \
|
||||
b = 1; \
|
||||
ROUND_B(6); \
|
||||
ROUND_B(7); \
|
||||
ROUND_B(8); \
|
||||
ROUND_B(9); \
|
||||
}; \
|
||||
blkH0 ^= blkS0 ^ V0 ^ V8, \
|
||||
blkH1 ^= blkS1 ^ V1 ^ V9, \
|
||||
blkH2 ^= blkS2 ^ V2 ^ VA, \
|
||||
blkH3 ^= blkS3 ^ V3 ^ VB, \
|
||||
blkH4 ^= blkS0 ^ V4 ^ VC, \
|
||||
blkH5 ^= blkS1 ^ V5 ^ VD, \
|
||||
blkH6 ^= blkS2 ^ V6 ^ VE, \
|
||||
blkH7 ^= blkS3 ^ V7 ^ VF; \
|
||||
} while (0)
|
||||
/*
|
||||
*/
|
||||
#define DECL_BLK \
|
||||
sph_u64 blkH0; \
|
||||
sph_u64 blkH1; \
|
||||
sph_u64 blkH2; \
|
||||
sph_u64 blkH3; \
|
||||
sph_u64 blkH4; \
|
||||
sph_u64 blkH5; \
|
||||
sph_u64 blkH6; \
|
||||
sph_u64 blkH7; \
|
||||
sph_u64 blkS0; \
|
||||
sph_u64 blkS1; \
|
||||
sph_u64 blkS2; \
|
||||
sph_u64 blkS3; \
|
||||
|
||||
/* load initial constants */
|
||||
#define BLK_I \
|
||||
do { \
|
||||
blkH0 = SPH_C64(0x6A09E667F3BCC908); \
|
||||
blkH1 = SPH_C64(0xBB67AE8584CAA73B); \
|
||||
blkH2 = SPH_C64(0x3C6EF372FE94F82B); \
|
||||
blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \
|
||||
blkH4 = SPH_C64(0x510E527FADE682D1); \
|
||||
blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \
|
||||
blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \
|
||||
blkH7 = SPH_C64(0x5BE0CD19137E2179); \
|
||||
blkS0 = 0; \
|
||||
blkS1 = 0; \
|
||||
blkS2 = 0; \
|
||||
blkS3 = 0; \
|
||||
hashctB = SPH_T64(0- 1); \
|
||||
} while (0)
|
||||
|
||||
/* copy in 80 for initial hash */
|
||||
#define BLK_W \
|
||||
do { \
|
||||
memcpy(hashbuf, input, 80); \
|
||||
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \
|
||||
hashptr = 80; \
|
||||
} while (0)
|
||||
|
||||
/* copy in 64 for looped hash */
|
||||
#define BLK_U \
|
||||
do { \
|
||||
memcpy(hashbuf, hash , 64); \
|
||||
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \
|
||||
hashptr = 64; \
|
||||
} while (0)
|
||||
|
||||
/* blake compress function */
|
||||
/* hash = blake512(loaded) */
|
||||
#define BLK_C \
|
||||
do { \
|
||||
\
|
||||
union { \
|
||||
unsigned char buf[128]; \
|
||||
sph_u64 dummy; \
|
||||
} u; \
|
||||
size_t ptr; \
|
||||
unsigned bit_len; \
|
||||
\
|
||||
ptr = hashptr; \
|
||||
bit_len = ((unsigned)ptr << 3) + 0; \
|
||||
u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; \
|
||||
memset(u.buf + ptr + 1, 0, 111 - ptr); \
|
||||
u.buf[111] |= 1; \
|
||||
sph_enc64be_aligned(u.buf + 112, 0); \
|
||||
sph_enc64be_aligned(u.buf + 120, bit_len); \
|
||||
do { \
|
||||
const void *data = u.buf + ptr; \
|
||||
unsigned char *buf; \
|
||||
buf = hashbuf; \
|
||||
size_t clen; \
|
||||
clen = (sizeof(char)*128) - hashptr; \
|
||||
memcpy(buf + hashptr, data, clen); \
|
||||
hashctA = SPH_T64(hashctA + 1024); \
|
||||
hashctB = SPH_T64(hashctB + 1); \
|
||||
COMPRESS64; \
|
||||
} while (0); \
|
||||
/* end blake64(sc, u.buf + ptr, 128 - ptr); */ \
|
||||
sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \
|
||||
sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \
|
||||
sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \
|
||||
sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \
|
||||
sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \
|
||||
sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \
|
||||
sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \
|
||||
sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,2 +0,0 @@
|
||||
#define CRYPTO_BYTES 64
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
amd64
|
||||
x86
|
||||
@@ -1,8 +0,0 @@
|
||||
#ifndef __BLAKE512_CONFIG_H__
|
||||
#define __BLAKE512_CONFIG_H__
|
||||
|
||||
#define AVOID_BRANCHING 1
|
||||
//#define HAVE_XOP 1
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
|
||||
#include "hash.h"
|
||||
/*
|
||||
#ifndef NOT_SUPERCOP
|
||||
|
||||
#include "crypto_hash.h"
|
||||
#include "crypto_uint64.h"
|
||||
#include "crypto_uint32.h"
|
||||
#include "crypto_uint8.h"
|
||||
|
||||
typedef crypto_uint64 u64;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint8 u8;
|
||||
|
||||
#else
|
||||
|
||||
typedef unsigned long long u64;
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
#endif
|
||||
*/
|
||||
#define U8TO32(p) \
|
||||
(((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
|
||||
((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
|
||||
#define U8TO64(p) \
|
||||
(((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
|
||||
#define U32TO8(p, v) \
|
||||
(p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
|
||||
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
|
||||
#define U64TO8(p, v) \
|
||||
U32TO8((p), (u32)((v) >> 32)); \
|
||||
U32TO8((p) + 4, (u32)((v) ));
|
||||
/*
|
||||
typedef struct
|
||||
{
|
||||
__m128i h[4];
|
||||
u64 s[4], t[2];
|
||||
u32 buflen, nullt;
|
||||
u8 buf[128];
|
||||
} state __attribute__ ((aligned (64)));
|
||||
*/
|
||||
static const u8 padding[129] =
|
||||
{
|
||||
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
static inline int blake512_compress( hashState_blake * state, const u8 * datablock )
|
||||
{
|
||||
|
||||
__m128i row1l,row1h;
|
||||
__m128i row2l,row2h;
|
||||
__m128i row3l,row3h;
|
||||
__m128i row4l,row4h;
|
||||
|
||||
const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
|
||||
const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
|
||||
|
||||
__m128i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
|
||||
__m128i b0, b1, b2, b3;
|
||||
|
||||
m0 = _mm_loadu_si128((__m128i*)(datablock + 0));
|
||||
m1 = _mm_loadu_si128((__m128i*)(datablock + 16));
|
||||
m2 = _mm_loadu_si128((__m128i*)(datablock + 32));
|
||||
m3 = _mm_loadu_si128((__m128i*)(datablock + 48));
|
||||
m4 = _mm_loadu_si128((__m128i*)(datablock + 64));
|
||||
m5 = _mm_loadu_si128((__m128i*)(datablock + 80));
|
||||
m6 = _mm_loadu_si128((__m128i*)(datablock + 96));
|
||||
m7 = _mm_loadu_si128((__m128i*)(datablock + 112));
|
||||
|
||||
m0 = BSWAP64(m0);
|
||||
m1 = BSWAP64(m1);
|
||||
m2 = BSWAP64(m2);
|
||||
m3 = BSWAP64(m3);
|
||||
m4 = BSWAP64(m4);
|
||||
m5 = BSWAP64(m5);
|
||||
m6 = BSWAP64(m6);
|
||||
m7 = BSWAP64(m7);
|
||||
|
||||
row1l = state->h[0];
|
||||
row1h = state->h[1];
|
||||
row2l = state->h[2];
|
||||
row2h = state->h[3];
|
||||
row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
|
||||
row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
|
||||
|
||||
row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
|
||||
row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);
|
||||
|
||||
#ifdef AVOID_BRANCHING
|
||||
do
|
||||
{
|
||||
const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt));
|
||||
const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask);
|
||||
const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask);
|
||||
row4l = _mm_xor_si128(row4l, xor1);
|
||||
row4h = _mm_xor_si128(row4h, xor2);
|
||||
} while(0);
|
||||
#else
|
||||
if(!state->nullt)
|
||||
{
|
||||
row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
|
||||
row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1]));
|
||||
}
|
||||
#endif
|
||||
|
||||
ROUND( 0);
|
||||
ROUND( 1);
|
||||
ROUND( 2);
|
||||
ROUND( 3);
|
||||
ROUND( 4);
|
||||
ROUND( 5);
|
||||
ROUND( 6);
|
||||
ROUND( 7);
|
||||
ROUND( 8);
|
||||
ROUND( 9);
|
||||
ROUND(10);
|
||||
ROUND(11);
|
||||
ROUND(12);
|
||||
ROUND(13);
|
||||
ROUND(14);
|
||||
ROUND(15);
|
||||
|
||||
row1l = _mm_xor_si128(row3l,row1l);
|
||||
row1h = _mm_xor_si128(row3h,row1h);
|
||||
|
||||
state->h[0] = _mm_xor_si128(row1l, state->h[0]);
|
||||
state->h[1] = _mm_xor_si128(row1h, state->h[1]);
|
||||
|
||||
row2l = _mm_xor_si128(row4l,row2l);
|
||||
row2h = _mm_xor_si128(row4h,row2h);
|
||||
|
||||
state->h[2] = _mm_xor_si128(row2l, state->h[2]);
|
||||
state->h[3] = _mm_xor_si128(row2h, state->h[3]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void blake512_init( hashState_blake * S, u64 databitlen )
|
||||
{
|
||||
memset(S, 0, sizeof(hashState_blake));
|
||||
S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL);
|
||||
S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL);
|
||||
S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL);
|
||||
S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL);
|
||||
S->buflen = databitlen;
|
||||
}
|
||||
|
||||
|
||||
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen )
|
||||
{
|
||||
|
||||
|
||||
int left = (S->buflen >> 3);
|
||||
int fill = 128 - left;
|
||||
|
||||
if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) {
|
||||
memcpy( (void *) (S->buf + left), (void *) data, fill );
|
||||
S->t[0] += 1024;
|
||||
blake512_compress( S, S->buf );
|
||||
data += fill;
|
||||
datalen -= (fill << 3);
|
||||
left = 0;
|
||||
}
|
||||
|
||||
while( datalen >= 1024 ) {
|
||||
S->t[0] += 1024;
|
||||
blake512_compress( S, data );
|
||||
data += 128;
|
||||
datalen -= 1024;
|
||||
}
|
||||
|
||||
if( datalen > 0 ) {
|
||||
memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F );
|
||||
S->buflen = (left<<3) + datalen;
|
||||
}
|
||||
else S->buflen=0;
|
||||
}
|
||||
|
||||
static inline void blake512_final( hashState_blake * S, u8 * digest )
|
||||
{
|
||||
|
||||
u8 msglen[16], zo=0x01,oo=0x81;
|
||||
u64 lo=S->t[0] + S->buflen, hi = S->t[1];
|
||||
if ( lo < S->buflen ) hi++;
|
||||
U64TO8( msglen + 0, hi );
|
||||
U64TO8( msglen + 8, lo );
|
||||
|
||||
if ( S->buflen == 888 ) /* one padding byte */
|
||||
{
|
||||
S->t[0] -= 8;
|
||||
blake512_update( S, &oo, 8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( S->buflen < 888 ) /* enough space to fill the block */
|
||||
{
|
||||
if ( S->buflen == 0 ) S->nullt=1;
|
||||
S->t[0] -= 888 - S->buflen;
|
||||
blake512_update( S, padding, 888 - S->buflen );
|
||||
}
|
||||
else /* NOT enough space, need 2 compressions */
|
||||
{
|
||||
S->t[0] -= 1024 - S->buflen;
|
||||
blake512_update( S, padding, 1024 - S->buflen );
|
||||
S->t[0] -= 888;
|
||||
blake512_update( S, padding+1, 888 );
|
||||
S->nullt = 1;
|
||||
}
|
||||
blake512_update( S, &zo, 8 );
|
||||
S->t[0] -= 8;
|
||||
}
|
||||
S->t[0] -= 128;
|
||||
blake512_update( S, msglen, 128 );
|
||||
|
||||
do
|
||||
{
|
||||
const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
|
||||
_mm_storeu_si128((__m128i*)(digest + 0), BSWAP64(S->h[0]));
|
||||
_mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1]));
|
||||
_mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2]));
|
||||
_mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3]));
|
||||
} while(0);
|
||||
}
|
||||
|
||||
/*
|
||||
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
|
||||
{
|
||||
|
||||
hashState_blake S;
|
||||
blake512_init( &S );
|
||||
blake512_update( &S, in, inlen*8 );
|
||||
blake512_final( &S, out );
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
#ifdef NOT_SUPERCOP
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
int i, v;
|
||||
u8 data[144], digest[64];
|
||||
u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1,
|
||||
0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4,
|
||||
0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09,
|
||||
0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
|
||||
u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F,
|
||||
0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F,
|
||||
0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22,
|
||||
0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
|
||||
|
||||
for(i=0; i<144; ++i) data[i]=0;
|
||||
|
||||
crypto_hash( digest, data, 1 );
|
||||
v=0;
|
||||
for(i=0; i<64; ++i) {
|
||||
printf("%02X", digest[i]);
|
||||
if ( digest[i] != test1[i]) v=1;
|
||||
}
|
||||
if (v) printf("\nerror\n");
|
||||
else printf("\nok\n");
|
||||
|
||||
for(i=0; i<144; ++i) data[i]=0;
|
||||
|
||||
crypto_hash( digest, data, 144 );
|
||||
v=0;
|
||||
for(i=0; i<64; ++i) {
|
||||
printf("%02X", digest[i]);
|
||||
if ( digest[i] != test2[i]) v=1;
|
||||
}
|
||||
if (v) printf("\nerror\n");
|
||||
else printf("\nok\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "rounds.h"
|
||||
/*
|
||||
#ifndef NOT_SUPERCOP
|
||||
|
||||
#include "crypto_hash.h"
|
||||
#include "crypto_uint64.h"
|
||||
#include "crypto_uint32.h"
|
||||
#include "crypto_uint8.h"
|
||||
|
||||
typedef crypto_uint64 u64;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint8 u8;
|
||||
|
||||
#else
|
||||
*/
|
||||
typedef unsigned long long u64;
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m128i h[4];
|
||||
u64 s[4], t[2];
|
||||
u32 buflen, nullt;
|
||||
u8 buf[128];
|
||||
} hashState_blake __attribute__ ((aligned (64)));
|
||||
/*
|
||||
#endif
|
||||
|
||||
#define U8TO32(p) \
|
||||
(((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
|
||||
((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
|
||||
#define U8TO64(p) \
|
||||
(((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
|
||||
#define U32TO8(p, v) \
|
||||
(p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
|
||||
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
|
||||
#define U64TO8(p, v) \
|
||||
U32TO8((p), (u32)((v) >> 32)); \
|
||||
U32TO8((p) + 4, (u32)((v) ));
|
||||
*/
|
||||
|
||||
/*
|
||||
static const u8 padding[129] =
|
||||
{
|
||||
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
*/
|
||||
static inline void blake512_init( hashState_blake * S, u64 datalen );
|
||||
|
||||
|
||||
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ;
|
||||
|
||||
static inline void blake512_final( hashState_blake * S, u8 * digest ) ;
|
||||
|
||||
|
||||
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
Jean-Philippe Aumasson
|
||||
Samuel Neves
|
||||
@@ -1,871 +0,0 @@
|
||||
|
||||
#ifndef __BLAKE512_ROUNDS_H__
|
||||
#define __BLAKE512_ROUNDS_H__
|
||||
|
||||
#ifndef HAVE_XOP
|
||||
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
|
||||
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c)))
|
||||
#else
|
||||
#define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64)
|
||||
#endif
|
||||
|
||||
|
||||
#define LOAD_MSG_0_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m0, m1); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m2, m3); \
|
||||
t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m0, m1); \
|
||||
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m2, m3); \
|
||||
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m4, m5); \
|
||||
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_0_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m5); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m7, m2); \
|
||||
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m4, m6); \
|
||||
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m3, m7, 8); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
||||
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m5, m2); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_1_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m6, m1); \
|
||||
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m3, m1); \
|
||||
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m6, m5, 8); \
|
||||
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m2, m7); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m4, m0); \
|
||||
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m1, m6, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m5, m1, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m3, m4); \
|
||||
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_2_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m7, m3); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m2, m0, 8); \
|
||||
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m3, m1); \
|
||||
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m6, m5); \
|
||||
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m0); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m1, m2, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_3_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m3, m5); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m0, m4); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m2); \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m1, m5); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m0, m3, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m7, m5, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m3, m1, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_4_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m6, m0, 8); \
|
||||
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m4, m6, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m1, m3); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m0, m4); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m6, m5); \
|
||||
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m5, m1); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m2, m3, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m7, m0); \
|
||||
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_5_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m6, m2); \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m7, m4, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m6, m0, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m7, m2); \
|
||||
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m2, m7); \
|
||||
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m5, m6, 8); \
|
||||
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m0, m3); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
|
||||
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_6_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m3, m1); \
|
||||
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m1, m5, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m6, m3); \
|
||||
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m6, m1, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m7, m5, 8); \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m0, m4); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m2, m7); \
|
||||
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m4, m1); \
|
||||
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_7_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m0, m2); \
|
||||
t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m3, m5); \
|
||||
t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m3, m7); \
|
||||
t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m0, m5, 8); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m7, m4); \
|
||||
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m4, m1, 8); \
|
||||
t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = m6; \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m5, m0, 8); \
|
||||
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_8_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m1, m3, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = m2; \
|
||||
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m3, m0); \
|
||||
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m1, m2); \
|
||||
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m3, m2, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m7, m4); \
|
||||
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m1, m6); \
|
||||
t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_9_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m7, m5, 8); \
|
||||
t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m6, m0); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m0, m1); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m2, m3); \
|
||||
t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m0, m1); \
|
||||
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m2, m3); \
|
||||
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m4, m5); \
|
||||
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_10_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m5); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m7, m2); \
|
||||
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m4, m6); \
|
||||
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m5, m4); \
|
||||
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m3, m7, 8); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
|
||||
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m5, m2); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_11_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m6, m1); \
|
||||
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m3, m1); \
|
||||
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_12_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m6, m5, 8); \
|
||||
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m2, m7); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_12_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m4, m0); \
|
||||
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m1, m6, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_12_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m5, m1, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m3, m4); \
|
||||
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_12_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m7, m3); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_alignr_epi8(m2, m0, 8); \
|
||||
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_13_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m3, m1); \
|
||||
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m6, m5); \
|
||||
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_13_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m0); \
|
||||
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m6, m7); \
|
||||
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_13_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m1, m2, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_13_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m3, m5); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m0, m4); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_14_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m4, m2); \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m1, m5); \
|
||||
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_14_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m0, m3, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_14_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m7, m5, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m3, m1, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_14_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_alignr_epi8(m6, m0, 8); \
|
||||
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m4, m6, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_15_1(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m1, m3); \
|
||||
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpacklo_epi64(m0, m4); \
|
||||
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_15_2(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpacklo_epi64(m6, m5); \
|
||||
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m5, m1); \
|
||||
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_15_3(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_blend_epi16(m2, m3, 0xF0); \
|
||||
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_unpackhi_epi64(m7, m0); \
|
||||
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define LOAD_MSG_15_4(b0, b1) \
|
||||
do \
|
||||
{ \
|
||||
t0 = _mm_unpackhi_epi64(m6, m2); \
|
||||
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
|
||||
b0 = _mm_xor_si128(t0, t1); \
|
||||
t2 = _mm_blend_epi16(m7, m4, 0xF0); \
|
||||
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
|
||||
b1 = _mm_xor_si128(t2, t3); \
|
||||
} while(0)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -32); \
|
||||
row4h = _mm_roti_epi64(row4h, -32); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -25); \
|
||||
row2h = _mm_roti_epi64(row2h, -25); \
|
||||
|
||||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -16); \
|
||||
row4h = _mm_roti_epi64(row4h, -16); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -11); \
|
||||
row2h = _mm_roti_epi64(row2h, -11); \
|
||||
|
||||
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define ROUND(r) \
|
||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -62,9 +62,10 @@ typedef struct {
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
void bmw256_4way_init( bmw256_4way_context *ctx );
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw256_4way bmw256_4way_update
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -78,7 +79,7 @@ void bmw256_4way_addbits_and_close(
|
||||
// BMW-256 8 way 32
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[64];
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
@@ -87,11 +88,33 @@ typedef struct {
|
||||
typedef bmw_8way_small_context bmw256_8way_context;
|
||||
|
||||
void bmw256_8way_init( bmw256_8way_context *ctx );
|
||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
|
||||
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
#define bmw256_8way bmw256_8way_update
|
||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-256 16 way 32
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef bmw_16way_small_context bmw256_16way_context;
|
||||
|
||||
void bmw256_16way_init( bmw256_16way_context *ctx );
|
||||
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
@@ -107,28 +130,29 @@ typedef struct {
|
||||
typedef bmw_2way_big_context bmw512_2way_context;
|
||||
|
||||
void bmw512_2way_init( bmw512_2way_context *ctx );
|
||||
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
|
||||
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-512 4 way 64
|
||||
// BMW-512 64 bit 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
} bmw_4way_big_context;
|
||||
} bmw_4way_big_context __attribute__((aligned(128)));
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
void bmw512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw512_4way bmw512_4way_update
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -137,6 +161,25 @@ void bmw512_4way_addbits_and_close(
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-512 64 bit 8 way
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t bit_count;
|
||||
} bmw512_8way_context __attribute__((aligned(128)));
|
||||
|
||||
void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
|
||||
size_t len );
|
||||
void bmw512_8way_init( bmw512_8way_context *ctx );
|
||||
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,34 +1,88 @@
|
||||
#include "bmw512-gate.h"
|
||||
|
||||
#ifdef BMW512_4WAY
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph_keccak.h"
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way(void *state, const void *input)
|
||||
{
|
||||
bmw512_8way_context ctx;
|
||||
bmw512_8way_init( &ctx );
|
||||
bmw512_8way_update( &ctx, input, 80 );
|
||||
bmw512_8way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
bmw512hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
//#ifdef BMW512_4WAY
|
||||
|
||||
void bmw512hash_4way(void *state, const void *input)
|
||||
{
|
||||
bmw512_4way_context ctx;
|
||||
bmw512_4way_init( &ctx );
|
||||
bmw512_4way( &ctx, input, 80 );
|
||||
bmw512_4way_update( &ctx, input, 80 );
|
||||
bmw512_4way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
// const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
@@ -39,20 +93,20 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
bmw512hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#include "bmw512-gate.h"
|
||||
|
||||
int64_t bmw512_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_bmw512_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->set_target = (void*)&alt_set_target;
|
||||
gate->get_max64 = (void*)&bmw512_get_max64;
|
||||
#if defined (BMW512_4WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
#if defined (BMW512_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_8way;
|
||||
gate->hash = (void*)&bmw512hash_8way;
|
||||
#elif defined (BMW512_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_4way;
|
||||
gate->hash = (void*)&bmw512hash_4way;
|
||||
#else
|
||||
|
||||
@@ -1,23 +1,33 @@
|
||||
#ifndef BMW512_GATE_H__
|
||||
#define BMW512_GATE_H__
|
||||
#define BMW512_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BMW512_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define BMW512_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(BMW512_4WAY)
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way( void *state, const void *input );
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
void bmw512hash_4way( void *state, const void *input );
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void bmw512hash( void *state, const void *input );
|
||||
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if !defined(BMW512_8WAY) && !defined(BMW512_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
@@ -50,4 +52,4 @@ int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -48,6 +48,8 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
static const sph_u32 IV224[] = {
|
||||
SPH_C32(0x00010203), SPH_C32(0x04050607),
|
||||
SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
|
||||
@@ -70,6 +72,8 @@ static const sph_u32 IV256[] = {
|
||||
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
|
||||
};
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static const sph_u64 IV384[] = {
|
||||
@@ -135,6 +139,8 @@ static const sph_u64 IV512[] = {
|
||||
#define M16_30 14, 15, 1, 2, 5, 8, 9
|
||||
#define M16_31 15, 16, 2, 3, 6, 9, 10
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
@@ -189,6 +195,8 @@ static const sph_u64 IV512[] = {
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
|
||||
@@ -291,6 +299,8 @@ static const sph_u64 Kb_tab[] = {
|
||||
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
@@ -407,6 +417,8 @@ static const sph_u64 Kb_tab[] = {
|
||||
|
||||
#define Qs(j) (qt[j])
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
@@ -557,7 +569,6 @@ static const sph_u64 Kb_tab[] = {
|
||||
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
|
||||
} while (0)
|
||||
|
||||
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
#if SPH_64
|
||||
|
||||
@@ -565,6 +576,10 @@ static const sph_u64 Kb_tab[] = {
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
static void
|
||||
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
|
||||
{
|
||||
@@ -711,6 +726,8 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
|
||||
sph_enc32le(out + 4 * u, h1[v]);
|
||||
}
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static void
|
||||
@@ -840,6 +857,8 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
/* see sph_bmw.h */
|
||||
void
|
||||
sph_bmw224_init(void *cc)
|
||||
@@ -898,6 +917,8 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
// sph_bmw256_init(cc);
|
||||
}
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/* see sph_bmw.h */
|
||||
|
||||
@@ -77,6 +77,9 @@ extern "C"{
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
@@ -102,6 +105,8 @@ typedef sph_bmw_small_context sph_bmw224_context;
|
||||
*/
|
||||
typedef sph_bmw_small_context sph_bmw256_context;
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
@@ -137,6 +142,8 @@ typedef sph_bmw_big_context sph_bmw512_context;
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
/**
|
||||
* Initialize a BMW-224 context. This process performs no memory allocation.
|
||||
*
|
||||
@@ -227,6 +234,8 @@ void sph_bmw256_close(void *cc, void *dst);
|
||||
void sph_bmw256_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,519 +0,0 @@
|
||||
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* BMW implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include "../sph_bmw.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
static const sph_u64 bmwIV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
};
|
||||
|
||||
#define XCAT(x, y) XCAT_(x, y)
|
||||
#define XCAT_(x, y) x ## y
|
||||
|
||||
#define LPAR (
|
||||
|
||||
#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
||||
#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
||||
#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
|
||||
#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
|
||||
#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
|
||||
#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
||||
#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
|
||||
#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
|
||||
#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
|
||||
#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
||||
#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
|
||||
#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
|
||||
#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
|
||||
#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
|
||||
|
||||
#define M16_16 0, 1, 3, 4, 7, 10, 11
|
||||
#define M16_17 1, 2, 4, 5, 8, 11, 12
|
||||
#define M16_18 2, 3, 5, 6, 9, 12, 13
|
||||
#define M16_19 3, 4, 6, 7, 10, 13, 14
|
||||
#define M16_20 4, 5, 7, 8, 11, 14, 15
|
||||
#define M16_21 5, 6, 8, 9, 12, 15, 16
|
||||
#define M16_22 6, 7, 9, 10, 13, 0, 1
|
||||
#define M16_23 7, 8, 10, 11, 14, 1, 2
|
||||
#define M16_24 8, 9, 11, 12, 15, 2, 3
|
||||
#define M16_25 9, 10, 12, 13, 0, 3, 4
|
||||
#define M16_26 10, 11, 13, 14, 1, 4, 5
|
||||
#define M16_27 11, 12, 14, 15, 2, 5, 6
|
||||
#define M16_28 12, 13, 15, 16, 3, 6, 7
|
||||
#define M16_29 13, 14, 0, 1, 4, 7, 8
|
||||
#define M16_30 14, 15, 1, 2, 5, 8, 9
|
||||
#define M16_31 15, 16, 2, 3, 6, 9, 10
|
||||
|
||||
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
|
||||
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
|
||||
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
|
||||
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
|
||||
#define ss4(x) (((x) >> 1) ^ (x))
|
||||
#define ss5(x) (((x) >> 2) ^ (x))
|
||||
#define rs1(x) SPH_ROTL32(x, 3)
|
||||
#define rs2(x) SPH_ROTL32(x, 7)
|
||||
#define rs3(x) SPH_ROTL32(x, 13)
|
||||
#define rs4(x) SPH_ROTL32(x, 16)
|
||||
#define rs5(x) SPH_ROTL32(x, 19)
|
||||
#define rs6(x) SPH_ROTL32(x, 23)
|
||||
#define rs7(x) SPH_ROTL32(x, 27)
|
||||
|
||||
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
|
||||
|
||||
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
|
||||
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
|
||||
|
||||
#define expand1s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
|
||||
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
|
||||
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
|
||||
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand1s(qf, mf, hf, i16) \
|
||||
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#define expand2s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
|
||||
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
|
||||
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
|
||||
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand2s(qf, mf, hf, i16) \
|
||||
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#if SPH_64
|
||||
|
||||
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
|
||||
^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37))
|
||||
#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \
|
||||
^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
|
||||
#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \
|
||||
^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
|
||||
#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \
|
||||
^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
|
||||
#define sb4(x) (((x) >> 1) ^ (x))
|
||||
#define sb5(x) (((x) >> 2) ^ (x))
|
||||
#define rb1(x) SPH_ROTL64(x, 5)
|
||||
#define rb2(x) SPH_ROTL64(x, 11)
|
||||
#define rb3(x) SPH_ROTL64(x, 27)
|
||||
#define rb4(x) SPH_ROTL64(x, 32)
|
||||
#define rb5(x) SPH_ROTL64(x, 37)
|
||||
#define rb6(x) SPH_ROTL64(x, 43)
|
||||
#define rb7(x) SPH_ROTL64(x, 53)
|
||||
|
||||
#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
|
||||
|
||||
#if 0
|
||||
|
||||
static const sph_u64 Kb_tab[] = {
|
||||
Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
|
||||
Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
|
||||
};
|
||||
|
||||
#define rol_off(mf, j, off) \
|
||||
SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
|
||||
|
||||
#define add_elt_b(mf, hf, j) \
|
||||
(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
|
||||
- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
|
||||
|
||||
#define expand1b(qf, mf, hf, i) \
|
||||
SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
|
||||
+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
|
||||
+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
|
||||
+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
|
||||
+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
|
||||
+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
|
||||
+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
|
||||
+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
|
||||
+ add_elt_b(mf, hf, (i) - 16))
|
||||
|
||||
#define expand2b(qf, mf, hf, i) \
|
||||
SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
|
||||
+ qf((i) - 14) + rb2(qf((i) - 13)) \
|
||||
+ qf((i) - 12) + rb3(qf((i) - 11)) \
|
||||
+ qf((i) - 10) + rb4(qf((i) - 9)) \
|
||||
+ qf((i) - 8) + rb5(qf((i) - 7)) \
|
||||
+ qf((i) - 6) + rb6(qf((i) - 5)) \
|
||||
+ qf((i) - 4) + rb7(qf((i) - 3)) \
|
||||
+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
|
||||
+ add_elt_b(mf, hf, (i) - 16))
|
||||
|
||||
#else
|
||||
|
||||
#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
|
||||
- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
|
||||
|
||||
#define expand1b_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
|
||||
+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
|
||||
+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
|
||||
+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
|
||||
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand1b(qf, mf, hf, i16) \
|
||||
expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1b_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#define expand2b_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
|
||||
+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
|
||||
+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
|
||||
+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
|
||||
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand2b(qf, mf, hf, i16) \
|
||||
expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2b_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
|
||||
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
|
||||
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
#define MAKE_Qas do { \
|
||||
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
|
||||
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
|
||||
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
|
||||
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
|
||||
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
|
||||
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
|
||||
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qs do { \
|
||||
MAKE_Qas; \
|
||||
MAKE_Qbs; \
|
||||
} while (0)
|
||||
|
||||
#define Qs(j) (qt[j])
|
||||
|
||||
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
#define MAKE_Qab do { \
|
||||
qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
|
||||
qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
|
||||
qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
|
||||
qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
|
||||
qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
|
||||
qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
|
||||
qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbb do { \
|
||||
qt[16] = expand1b(Qb, M, H, 16); \
|
||||
qt[17] = expand1b(Qb, M, H, 17); \
|
||||
qt[18] = expand2b(Qb, M, H, 18); \
|
||||
qt[19] = expand2b(Qb, M, H, 19); \
|
||||
qt[20] = expand2b(Qb, M, H, 20); \
|
||||
qt[21] = expand2b(Qb, M, H, 21); \
|
||||
qt[22] = expand2b(Qb, M, H, 22); \
|
||||
qt[23] = expand2b(Qb, M, H, 23); \
|
||||
qt[24] = expand2b(Qb, M, H, 24); \
|
||||
qt[25] = expand2b(Qb, M, H, 25); \
|
||||
qt[26] = expand2b(Qb, M, H, 26); \
|
||||
qt[27] = expand2b(Qb, M, H, 27); \
|
||||
qt[28] = expand2b(Qb, M, H, 28); \
|
||||
qt[29] = expand2b(Qb, M, H, 29); \
|
||||
qt[30] = expand2b(Qb, M, H, 30); \
|
||||
qt[31] = expand2b(Qb, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qb do { \
|
||||
MAKE_Qab; \
|
||||
MAKE_Qbb; \
|
||||
} while (0)
|
||||
|
||||
#define Qb(j) (qt[j])
|
||||
|
||||
#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \
|
||||
type qt[32], xl, xh; \
|
||||
mkQ; \
|
||||
xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
|
||||
^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
|
||||
xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
|
||||
^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
|
||||
dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \
|
||||
+ (xl ^ qf(24) ^ qf( 0))); \
|
||||
dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \
|
||||
+ (xl ^ qf(25) ^ qf( 1))); \
|
||||
dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \
|
||||
+ (xl ^ qf(26) ^ qf( 2))); \
|
||||
dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \
|
||||
+ (xl ^ qf(27) ^ qf( 3))); \
|
||||
dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \
|
||||
+ (xl ^ qf(28) ^ qf( 4))); \
|
||||
dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \
|
||||
+ (xl ^ qf(29) ^ qf( 5))); \
|
||||
dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \
|
||||
+ (xl ^ qf(30) ^ qf( 6))); \
|
||||
dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \
|
||||
+ (xl ^ qf(31) ^ qf( 7))); \
|
||||
dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \
|
||||
+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
|
||||
dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
|
||||
+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
|
||||
dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
|
||||
+ ((xl << 6) ^ qf(17) ^ qf(10))); \
|
||||
dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
|
||||
+ ((xl << 4) ^ qf(18) ^ qf(11))); \
|
||||
dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
|
||||
+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
|
||||
dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
|
||||
+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
|
||||
dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
|
||||
+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
|
||||
dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
|
||||
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
|
||||
} while (0)
|
||||
|
||||
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
|
||||
|
||||
#define DECL_BMW \
|
||||
sph_u64 bmwH[16]; \
|
||||
|
||||
/* load initial constants */
|
||||
#define BMW_I \
|
||||
do { \
|
||||
memcpy(bmwH, bmwIV512, sizeof bmwH); \
|
||||
hashptr = 0; \
|
||||
hashctA = 0; \
|
||||
} while (0)
|
||||
|
||||
/* load hash for loop */
|
||||
#define BMW_U \
|
||||
do { \
|
||||
const void *data = hash; \
|
||||
size_t len = 64; \
|
||||
unsigned char *buf; \
|
||||
\
|
||||
hashctA += (sph_u64)len << 3; \
|
||||
buf = hashbuf; \
|
||||
memcpy(buf, data, 64); \
|
||||
hashptr = 64; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* bmw512 hash loaded */
|
||||
/* hash = blake512(loaded) */
|
||||
#define BMW_C \
|
||||
do { \
|
||||
void *dst = hash; \
|
||||
size_t out_size_w64 = 8; \
|
||||
unsigned char *data; \
|
||||
sph_u64 *dh; \
|
||||
unsigned char *out; \
|
||||
size_t ptr, u, v; \
|
||||
unsigned z; \
|
||||
sph_u64 h1[16], h2[16], *h; \
|
||||
data = hashbuf; \
|
||||
ptr = hashptr; \
|
||||
z = 0x80 >> 0; \
|
||||
data[ptr ++] = ((0 & -z) | z) & 0xFF; \
|
||||
memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \
|
||||
sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \
|
||||
SPH_T64(hashctA + 0)); \
|
||||
/* for break loop */ \
|
||||
/* one copy of inline FOLD */ \
|
||||
/* FOLD uses, */ \
|
||||
/* uint64 *h, data */ \
|
||||
/* uint64 dh, state */ \
|
||||
h = bmwH; \
|
||||
dh = h2; \
|
||||
for (;;) { \
|
||||
FOLDb; \
|
||||
/* dh gets changed for 2nd run */ \
|
||||
if (dh == h1) break; \
|
||||
for (u = 0; u < 16; u ++) \
|
||||
sph_enc64le_aligned(data + 8 * u, h2[u]); \
|
||||
dh = h1; \
|
||||
h = (sph_u64*)final_b; \
|
||||
} \
|
||||
/* end wrapped for break loop */ \
|
||||
out = dst; \
|
||||
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \
|
||||
sph_enc64le(out + 8 * u, h1[v]); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
static void
|
||||
compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
|
||||
{
|
||||
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
FOLDb;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
}
|
||||
*/
|
||||
|
||||
static const sph_u64 final_b[16] = {
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
|
||||
};
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
|
||||
* functions which differ by their output size; this implementation
|
||||
* defines BMW for output sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_bmw.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_BMW_H__
|
||||
#define SPH_BMW_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
sph_u64 bmwH[16];
|
||||
#endif
|
||||
} sph_bmw_big_context;
|
||||
|
||||
typedef sph_bmw_big_context sph_bmw512_context;
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,369 +0,0 @@
|
||||
// Copyright (c) 2012-2013 The Cryptonote developers
|
||||
// Distributed under the MIT/X11 software license, see the accompanying
|
||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__arm__) || defined(_MSC_VER)
|
||||
#ifndef NOASM
|
||||
#define NOASM
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "crypto/oaes_lib.h"
|
||||
#include "crypto/c_keccak.h"
|
||||
#include "crypto/c_groestl.h"
|
||||
#include "crypto/c_blake256.h"
|
||||
#include "crypto/c_jh.h"
|
||||
#include "crypto/c_skein.h"
|
||||
#include "crypto/int-util.h"
|
||||
#include "crypto/hash-ops.h"
|
||||
|
||||
#if USE_INT128
|
||||
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
|
||||
typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
|
||||
#elif defined (_MSC_VER)
|
||||
/* only for mingw64 on windows */
|
||||
#undef USE_INT128
|
||||
#define USE_INT128 (0)
|
||||
#else
|
||||
typedef __uint128_t uint128_t;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define LITE 1
|
||||
#if LITE /* cryptonight-light */
|
||||
#define MEMORY (1 << 20)
|
||||
#define ITER (1 << 19)
|
||||
#else
|
||||
#define MEMORY (1 << 21) /* 2 MiB */
|
||||
#define ITER (1 << 20)
|
||||
#endif
|
||||
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32 /*16*/
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state {
|
||||
union hash_state hs;
|
||||
struct {
|
||||
uint8_t k[64];
|
||||
uint8_t init[INIT_SIZE_BYTE];
|
||||
};
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
static void do_blake_hash(const void* input, size_t len, char* output) {
|
||||
blake256_hash((uint8_t*)output, input, len);
|
||||
}
|
||||
|
||||
static void do_groestl_hash(const void* input, size_t len, char* output) {
|
||||
groestl(input, len * 8, (uint8_t*)output);
|
||||
}
|
||||
|
||||
static void do_jh_hash(const void* input, size_t len, char* output) {
|
||||
int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
|
||||
assert(likely(SUCCESS == r));
|
||||
}
|
||||
|
||||
static void do_skein_hash(const void* input, size_t len, char* output) {
|
||||
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
|
||||
assert(likely(SKEIN_SUCCESS == r));
|
||||
}
|
||||
|
||||
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||
#if !defined(_MSC_VER) && !defined(NOASM)
|
||||
extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||
#else
|
||||
#define fast_aesb_single_round aesb_single_round
|
||||
#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
|
||||
#endif
|
||||
|
||||
#if defined(NOASM) || !defined(__x86_64__)
|
||||
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
|
||||
// multiplier = ab = a * 2^32 + b
|
||||
// multiplicand = cd = c * 2^32 + d
|
||||
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
|
||||
uint64_t a = hi_dword(multiplier);
|
||||
uint64_t b = lo_dword(multiplier);
|
||||
uint64_t c = hi_dword(multiplicand);
|
||||
uint64_t d = lo_dword(multiplicand);
|
||||
|
||||
uint64_t ac = a * c;
|
||||
uint64_t ad = a * d;
|
||||
uint64_t bc = b * c;
|
||||
uint64_t bd = b * d;
|
||||
|
||||
uint64_t adbc = ad + bc;
|
||||
uint64_t adbc_carry = adbc < ad ? 1 : 0;
|
||||
|
||||
// multiplier * multiplicand = product_hi * 2^64 + product_lo
|
||||
uint64_t product_lo = bd + (adbc << 32);
|
||||
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
|
||||
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
|
||||
assert(ac <= *product_hi);
|
||||
|
||||
return product_lo;
|
||||
}
|
||||
#else
|
||||
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
|
||||
#endif
|
||||
|
||||
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
|
||||
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
||||
};
|
||||
|
||||
|
||||
static inline size_t e2i(const uint8_t* a) {
|
||||
#if !LITE
|
||||
return ((uint32_t *)a)[0] & 0x1FFFF0;
|
||||
#else
|
||||
return ((uint32_t *)a)[0] & 0xFFFF0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
|
||||
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
|
||||
hi += ((uint64_t*) c)[0];
|
||||
|
||||
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
|
||||
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
|
||||
((uint64_t*) dst)[0] = hi;
|
||||
((uint64_t*) dst)[1] = lo;
|
||||
}
|
||||
|
||||
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
|
||||
#if USE_INT128
|
||||
*((uint128_t*) a) ^= *((uint128_t*) b);
|
||||
#else
|
||||
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
|
||||
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
|
||||
#if USE_INT128
|
||||
*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
|
||||
#else
|
||||
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
|
||||
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
struct cryptonight_ctx {
|
||||
uint8_t _ALIGN(16) long_state[MEMORY];
|
||||
union cn_slow_hash_state state;
|
||||
uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
|
||||
uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
|
||||
uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
|
||||
uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
|
||||
oaes_ctx* aes_ctx;
|
||||
};
|
||||
|
||||
static void cryptolight_hash_ctx(void* output, const void* input, int len, struct cryptonight_ctx* ctx)
|
||||
{
|
||||
len = 76;
|
||||
hash_process(&ctx->state.hs, (const uint8_t*) input, len);
|
||||
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
size_t i, j;
|
||||
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||
|
||||
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
|
||||
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
|
||||
}
|
||||
|
||||
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
|
||||
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
|
||||
|
||||
for (i = 0; likely(i < ITER / 4); ++i) {
|
||||
/* Dependency chain: address -> read value ------+
|
||||
* written value <-+ hard function (AES or MUL) <+
|
||||
* next address <-+
|
||||
*/
|
||||
/* Iteration 1 */
|
||||
j = e2i(ctx->a);
|
||||
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
|
||||
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
|
||||
/* Iteration 2 */
|
||||
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
|
||||
/* Iteration 3 */
|
||||
j = e2i(ctx->a);
|
||||
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
|
||||
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
|
||||
/* Iteration 4 */
|
||||
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
|
||||
}
|
||||
|
||||
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
}
|
||||
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
|
||||
hash_permutation(&ctx->state.hs);
|
||||
/*memcpy(hash, &state, 32);*/
|
||||
extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
|
||||
oaes_free((OAES_CTX **) &ctx->aes_ctx);
|
||||
}
|
||||
|
||||
void cryptolight_hash(void* output, const void* input, int len) {
|
||||
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
|
||||
cryptolight_hash_ctx(output, input, len, ctx);
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
|
||||
int len, struct cryptonight_ctx* ctx)
|
||||
{
|
||||
hash_process(&ctx->state.hs, (const uint8_t*)input, len);
|
||||
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
size_t i, j;
|
||||
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||
|
||||
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
|
||||
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
|
||||
}
|
||||
|
||||
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
|
||||
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
|
||||
|
||||
for (i = 0; likely(i < ITER / 4); ++i) {
|
||||
/* Dependency chain: address -> read value ------+
|
||||
* written value <-+ hard function (AES or MUL) <+
|
||||
* next address <-+
|
||||
*/
|
||||
/* Iteration 1 */
|
||||
j = e2i(ctx->a);
|
||||
fast_aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
|
||||
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
|
||||
/* Iteration 2 */
|
||||
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
|
||||
/* Iteration 3 */
|
||||
j = e2i(ctx->a);
|
||||
fast_aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
|
||||
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
|
||||
/* Iteration 4 */
|
||||
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
|
||||
}
|
||||
|
||||
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
|
||||
fast_aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||
}
|
||||
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
|
||||
hash_permutation(&ctx->state.hs);
|
||||
/*memcpy(hash, &state, 32);*/
|
||||
extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
|
||||
oaes_free((OAES_CTX **) &ctx->aes_ctx);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int scanhash_cryptolight( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
|
||||
uint32_t n = *nonceptr - 1;
|
||||
const uint32_t first_nonce = n + 1;
|
||||
//const uint32_t Htarg = ptarget[7];
|
||||
uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
|
||||
|
||||
#if defined(__AES__)
|
||||
do {
|
||||
*nonceptr = ++n;
|
||||
cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
|
||||
if (unlikely(hash[7] < ptarget[7])) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
free(ctx);
|
||||
return true;
|
||||
}
|
||||
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
|
||||
#else
|
||||
do {
|
||||
*nonceptr = ++n;
|
||||
cryptolight_hash_ctx(hash, pdata, 76, ctx);
|
||||
if (unlikely(hash[7] < ptarget[7])) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
free(ctx);
|
||||
return true;
|
||||
}
|
||||
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
|
||||
#endif
|
||||
free(ctx);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_cryptolight_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_cryptolight;
|
||||
gate->hash = (void*)&cryptolight_hash;
|
||||
gate->hash_suw = (void*)&cryptolight_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -1,357 +0,0 @@
|
||||
#if defined(__AES__)
|
||||
|
||||
#include <x86intrin.h>
|
||||
#include <memory.h>
|
||||
#include "cryptonight.h"
|
||||
#include "miner.h"
|
||||
#include "crypto/c_keccak.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
|
||||
{
|
||||
__m128i tmp4;
|
||||
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
|
||||
tmp4 = _mm_slli_si128(*tmp1, 0x04);
|
||||
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
|
||||
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
|
||||
}
|
||||
|
||||
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
|
||||
{
|
||||
__m128i tmp2, tmp4;
|
||||
|
||||
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
|
||||
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
|
||||
tmp4 = _mm_slli_si128(*tmp3, 0x04);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
|
||||
}
|
||||
|
||||
// Special thanks to Intel for helping me
|
||||
// with ExpandAESKey256() and its subroutines
|
||||
static inline void ExpandAESKey256(char *keybuf)
|
||||
{
|
||||
__m128i tmp1, tmp2, tmp3, *keys;
|
||||
|
||||
keys = (__m128i *)keybuf;
|
||||
|
||||
tmp1 = _mm_load_si128((__m128i *)keybuf);
|
||||
tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[2] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[3] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[4] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[5] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[6] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[7] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[8] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[9] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[10] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[11] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[12] = tmp1;
|
||||
ExpandAESKey256_sub2(&tmp1, &tmp3);
|
||||
keys[13] = tmp3;
|
||||
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[14] = tmp1;
|
||||
}
|
||||
|
||||
// align to 64 byte cache line
|
||||
typedef struct
|
||||
{
|
||||
uint8_t long_state[MEMORY] __attribute((aligned(64)));
|
||||
union cn_slow_hash_state state;
|
||||
uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
|
||||
uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
|
||||
uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
|
||||
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
|
||||
} cryptonight_ctx;
|
||||
|
||||
static __thread cryptonight_ctx ctx;
|
||||
|
||||
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
{
|
||||
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
|
||||
__m128i *longoutput, *expkey, *xmminput;
|
||||
size_t i, j;
|
||||
|
||||
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
||||
|
||||
if ( cryptonightV7 && len < 43 )
|
||||
return;
|
||||
|
||||
const uint64_t tweak = cryptonightV7
|
||||
? *((const uint64_t*) (((const uint8_t*)input) + 35))
|
||||
^ ctx.state.hs.w[24] : 0;
|
||||
|
||||
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
|
||||
ExpandAESKey256( ExpandedKey );
|
||||
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
|
||||
|
||||
longoutput = (__m128i*)ctx.long_state;
|
||||
xmminput = (__m128i*)ctx.text;
|
||||
expkey = (__m128i*)ExpandedKey;
|
||||
|
||||
// prefetch expkey, xmminput and enough longoutput for 4 iterations
|
||||
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||
for ( i = 0; i < 64; i += 16 )
|
||||
{
|
||||
__builtin_prefetch( longoutput + i, 1, 0 );
|
||||
__builtin_prefetch( longoutput + i + 4, 1, 0 );
|
||||
__builtin_prefetch( longoutput + i + 8, 1, 0 );
|
||||
__builtin_prefetch( longoutput + i + 12, 1, 0 );
|
||||
}
|
||||
|
||||
// n-4 iterations
|
||||
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
|
||||
i += INIT_SIZE_M128I )
|
||||
{
|
||||
// prefetch 4 iterations ahead.
|
||||
__builtin_prefetch( longoutput + i + 64, 1, 0 );
|
||||
__builtin_prefetch( longoutput + i + 68, 1, 0 );
|
||||
|
||||
for ( j = 0; j < 10; j++ )
|
||||
{
|
||||
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||
}
|
||||
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
|
||||
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
|
||||
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
|
||||
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
|
||||
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
|
||||
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
|
||||
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
|
||||
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
|
||||
}
|
||||
// last 4 iterations
|
||||
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
|
||||
{
|
||||
for ( j = 0; j < 10; j++ )
|
||||
{
|
||||
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||
}
|
||||
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
|
||||
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
|
||||
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
|
||||
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
|
||||
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
|
||||
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
|
||||
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
|
||||
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
|
||||
}
|
||||
|
||||
ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
|
||||
ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
|
||||
ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
|
||||
ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
|
||||
|
||||
uint64_t a[2] __attribute((aligned(16))),
|
||||
b[2] __attribute((aligned(16))),
|
||||
c[2] __attribute((aligned(16)));
|
||||
a[0] = ctx.a[0];
|
||||
a[1] = ctx.a[1];
|
||||
__m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
|
||||
__m128i a_x = _mm_load_si128( (__m128i*)a );
|
||||
__m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
|
||||
__m128i c_x = _mm_load_si128( lsa );
|
||||
uint64_t *nextblock;
|
||||
uint64_t hi, lo;
|
||||
|
||||
// n-1 iterations
|
||||
for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
|
||||
{
|
||||
c_x = _mm_aesenc_si128( c_x, a_x );
|
||||
_mm_store_si128( (__m128i*)c, c_x );
|
||||
b_x = _mm_xor_si128( b_x, c_x );
|
||||
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||
_mm_store_si128( lsa, b_x );
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
b[0] = nextblock[0];
|
||||
b[1] = nextblock[1];
|
||||
|
||||
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
|
||||
__asm__( "mulq %3\n\t"
|
||||
: "=d" ( hi ),
|
||||
"=a" ( lo )
|
||||
: "%a" ( c[0] ),
|
||||
"rm" ( b[0] )
|
||||
: "cc" );
|
||||
|
||||
b_x = c_x;
|
||||
|
||||
a[0] += hi;
|
||||
a[1] += lo;
|
||||
nextblock[0] = a[0];
|
||||
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
|
||||
a[0] ^= b[0];
|
||||
a[1] ^= b[1];
|
||||
|
||||
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
|
||||
a_x = _mm_load_si128( (__m128i*)a );
|
||||
c_x = _mm_load_si128( lsa );
|
||||
}
|
||||
// abreviated nth iteration
|
||||
c_x = _mm_aesenc_si128( c_x, a_x );
|
||||
_mm_store_si128( (__m128i*)c, c_x );
|
||||
b_x = _mm_xor_si128( b_x, c_x );
|
||||
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||
_mm_store_si128( lsa, b_x );
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
b[0] = nextblock[0];
|
||||
b[1] = nextblock[1];
|
||||
|
||||
__asm__( "mulq %3\n\t"
|
||||
: "=d" ( hi ),
|
||||
"=a" ( lo )
|
||||
: "%a" ( c[0] ),
|
||||
"rm" ( b[0] )
|
||||
: "cc" );
|
||||
|
||||
a[0] += hi;
|
||||
a[1] += lo;
|
||||
nextblock[0] = a[0];
|
||||
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
|
||||
a[0] ^= b[0];
|
||||
a[1] ^= b[1];
|
||||
|
||||
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
|
||||
ExpandAESKey256( ExpandedKey );
|
||||
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
|
||||
|
||||
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
|
||||
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||
for ( i = 0; i < 64; i += 16 )
|
||||
{
|
||||
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
|
||||
}
|
||||
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||
|
||||
// n-4 iterations
|
||||
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
|
||||
i += INIT_SIZE_M128I )
|
||||
{
|
||||
// stay 4 iterations ahead.
|
||||
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
|
||||
|
||||
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
|
||||
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
|
||||
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
|
||||
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
|
||||
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
|
||||
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
|
||||
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
|
||||
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
|
||||
|
||||
for( j = 0; j < 10; j++ )
|
||||
{
|
||||
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||
}
|
||||
}
|
||||
// last 4 iterations
|
||||
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
|
||||
{
|
||||
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
|
||||
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
|
||||
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
|
||||
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
|
||||
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
|
||||
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
|
||||
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
|
||||
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
|
||||
|
||||
for( j = 0; j < 10; j++ )
|
||||
{
|
||||
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||
}
|
||||
}
|
||||
|
||||
memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
|
||||
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
||||
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
||||
|
||||
}
|
||||
#endif
|
||||
@@ -1,129 +0,0 @@
|
||||
// Copyright (c) 2012-2013 The Cryptonote developers
|
||||
// Distributed under the MIT/X11 software license, see the accompanying
|
||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
|
||||
// Modified for CPUminer by Lucas Jones
|
||||
|
||||
#include "cpuminer-config.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#else
|
||||
#include "crypto/c_groestl.h"
|
||||
#endif
|
||||
#include "crypto/c_blake256.h"
|
||||
#include "crypto/c_jh.h"
|
||||
#include "crypto/c_skein.h"
|
||||
#include "cryptonight.h"
|
||||
|
||||
/*
|
||||
#if defined __unix__ && (!defined __APPLE__)
|
||||
#include <sys/mman.h>
|
||||
#elif defined _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
*/
|
||||
|
||||
void do_blake_hash(const void* input, size_t len, char* output) {
|
||||
blake256_hash((uint8_t*)output, input, len);
|
||||
}
|
||||
|
||||
void do_groestl_hash(const void* input, size_t len, char* output) {
|
||||
#if defined(__AES__)
|
||||
hashState_groestl256 ctx;
|
||||
init_groestl256( &ctx, 32 );
|
||||
update_and_final_groestl256( &ctx, output, input, len * 8 );
|
||||
#else
|
||||
groestl(input, len * 8, (uint8_t*)output);
|
||||
#endif
|
||||
}
|
||||
|
||||
void do_jh_hash(const void* input, size_t len, char* output) {
|
||||
jh_hash(32 * 8, input, 8 * len, (uint8_t*)output);
|
||||
}
|
||||
|
||||
void do_skein_hash(const void* input, size_t len, char* output) {
|
||||
skein_hash(8 * 32, input, 8 * len, (uint8_t*)output);
|
||||
}
|
||||
|
||||
void (* const extra_hashes[4])( const void *, size_t, char *) =
|
||||
{ do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash };
|
||||
|
||||
void cryptonight_hash( void *restrict output, const void *input, int len )
|
||||
{
|
||||
#if defined(__AES__)
|
||||
cryptonight_hash_aes( output, input, len );
|
||||
#else
|
||||
cryptonight_hash_ctx ( output, input, len );
|
||||
#endif
|
||||
}
|
||||
|
||||
void cryptonight_hash_suw( void *restrict output, const void *input )
|
||||
{
|
||||
#if defined(__AES__)
|
||||
cryptonight_hash_aes( output, input, 76 );
|
||||
#else
|
||||
cryptonight_hash_ctx ( output, input, 76 );
|
||||
#endif
|
||||
}
|
||||
|
||||
bool cryptonightV7 = false;
|
||||
|
||||
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
|
||||
uint32_t n = *nonceptr - 1;
|
||||
const uint32_t first_nonce = n + 1;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[32 / 4] __attribute__((aligned(32)));
|
||||
|
||||
// if ( ( cryptonightV7 && ( *(uint8_t*)pdata < 7 ) )
|
||||
// || ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
|
||||
// applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
|
||||
|
||||
do
|
||||
{
|
||||
*nonceptr = ++n;
|
||||
cryptonight_hash( hash, pdata, 76 );
|
||||
if (unlikely( hash[7] < Htarg ))
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
// work_set_target_ratio( work, hash );
|
||||
return true;
|
||||
}
|
||||
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
{
|
||||
cryptonightV7 = false;
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_cryptonightv7_algo( algo_gate_t* gate )
|
||||
{
|
||||
cryptonightV7 = true;
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -1,310 +0,0 @@
|
||||
// Copyright (c) 2012-2013 The Cryptonote developers
|
||||
// Distributed under the MIT/X11 software license, see the accompanying
|
||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
|
||||
// Modified for CPUminer by Lucas Jones
|
||||
|
||||
#include "miner.h"
|
||||
#include <memory.h>
|
||||
|
||||
#if defined(__arm__) || defined(_MSC_VER)
|
||||
#ifndef NOASM
|
||||
#define NOASM
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "crypto/oaes_lib.h"
|
||||
#include "crypto/c_keccak.h"
|
||||
#include "crypto/c_groestl.h"
|
||||
#include "crypto/c_blake256.h"
|
||||
#include "crypto/c_jh.h"
|
||||
#include "crypto/c_skein.h"
|
||||
#include "crypto/int-util.h"
|
||||
//#include "crypto/hash-ops.h"
|
||||
#include "cryptonight.h"
|
||||
|
||||
#if USE_INT128
|
||||
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
|
||||
typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
|
||||
#elif defined (_MSC_VER)
|
||||
/* only for mingw64 on windows */
|
||||
#undef USE_INT128
|
||||
#define USE_INT128 (0)
|
||||
#else
|
||||
typedef __uint128_t uint128_t;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define LITE 0
|
||||
#if LITE /* cryptonight-light */
|
||||
#define MEMORY (1 << 20)
|
||||
#define ITER (1 << 19)
|
||||
#else
|
||||
#define MEMORY (1 << 21) /* 2 MiB */
|
||||
#define ITER (1 << 20)
|
||||
#endif
|
||||
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32 /*16*/
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
/*
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state {
|
||||
union hash_state hs;
|
||||
struct {
|
||||
uint8_t k[64];
|
||||
uint8_t init[INIT_SIZE_BYTE];
|
||||
};
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
static void do_blake_hash(const void* input, size_t len, char* output) {
|
||||
blake256_hash((uint8_t*)output, input, len);
|
||||
}
|
||||
|
||||
static void do_groestl_hash(const void* input, size_t len, char* output) {
|
||||
groestl(input, len * 8, (uint8_t*)output);
|
||||
}
|
||||
|
||||
static void do_jh_hash(const void* input, size_t len, char* output) {
|
||||
int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
|
||||
assert(likely(SUCCESS == r));
|
||||
}
|
||||
|
||||
static void do_skein_hash(const void* input, size_t len, char* output) {
|
||||
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
|
||||
assert(likely(SKEIN_SUCCESS == r));
|
||||
}
|
||||
*/
|
||||
|
||||
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||
#if !defined(_MSC_VER) && !defined(NOASM)
|
||||
extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||
#else
|
||||
#define fast_aesb_single_round aesb_single_round
|
||||
#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(NOASM) || !defined(__x86_64__)
|
||||
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
|
||||
// multiplier = ab = a * 2^32 + b
|
||||
// multiplicand = cd = c * 2^32 + d
|
||||
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
|
||||
uint64_t a = hi_dword(multiplier);
|
||||
uint64_t b = lo_dword(multiplier);
|
||||
uint64_t c = hi_dword(multiplicand);
|
||||
uint64_t d = lo_dword(multiplicand);
|
||||
|
||||
uint64_t ac = a * c;
|
||||
uint64_t ad = a * d;
|
||||
uint64_t bc = b * c;
|
||||
uint64_t bd = b * d;
|
||||
|
||||
uint64_t adbc = ad + bc;
|
||||
uint64_t adbc_carry = adbc < ad ? 1 : 0;
|
||||
|
||||
// multiplier * multiplicand = product_hi * 2^64 + product_lo
|
||||
uint64_t product_lo = bd + (adbc << 32);
|
||||
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
|
||||
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
|
||||
assert(ac <= *product_hi);
|
||||
|
||||
return product_lo;
|
||||
}
|
||||
#else
|
||||
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
|
||||
#endif
|
||||
|
||||
/*
|
||||
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
|
||||
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
||||
};
|
||||
*/
|
||||
|
||||
static inline size_t e2i(const uint8_t* a) {
|
||||
#if !LITE
|
||||
return ((uint32_t *)a)[0] & 0x1FFFF0;
|
||||
#else
|
||||
return ((uint32_t *)a)[0] & 0xFFFF0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst,
|
||||
const uint64_t tweak )
|
||||
{
|
||||
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
|
||||
hi += ((uint64_t*) c)[0];
|
||||
|
||||
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
|
||||
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
|
||||
((uint64_t*) dst)[0] = hi;
|
||||
((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
|
||||
}
|
||||
|
||||
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
|
||||
#if USE_INT128
|
||||
*((uint128_t*) a) ^= *((uint128_t*) b);
|
||||
#else
|
||||
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
|
||||
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
|
||||
#if USE_INT128
|
||||
*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
|
||||
#else
|
||||
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
|
||||
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
|
||||
#endif
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
uint8_t _ALIGN(16) long_state[MEMORY];
|
||||
union cn_slow_hash_state state;
|
||||
uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
|
||||
uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
|
||||
uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
|
||||
uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
|
||||
oaes_ctx* aes_ctx;
|
||||
} cryptonight_ctx;
|
||||
|
||||
static __thread cryptonight_ctx ctx;
|
||||
|
||||
void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
{
|
||||
// hash_process(&ctx.state.hs, (const uint8_t*) input, len);
|
||||
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
||||
|
||||
if ( cryptonightV7 && len < 43 )
|
||||
return;
|
||||
const uint64_t tweak = cryptonightV7
|
||||
? *((const uint64_t*) (((const uint8_t*)input) + 35))
|
||||
^ ctx.state.hs.w[24] : 0;
|
||||
|
||||
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
__builtin_prefetch( ctx.text + 64, 0, 3 );
|
||||
__builtin_prefetch( ctx.long_state, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
|
||||
|
||||
size_t i, j;
|
||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
||||
|
||||
oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
|
||||
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
|
||||
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 3], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 4], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 5], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 6], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 7], ctx.aes_ctx->key->exp_data);
|
||||
memcpy(&ctx.long_state[i], ctx.text, INIT_SIZE_BYTE);
|
||||
}
|
||||
|
||||
xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
|
||||
xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
|
||||
|
||||
for (i = 0; likely(i < ITER / 4); ++i)
|
||||
{
|
||||
/* Dependency chain: address -> read value ------+
|
||||
* written value <-+ hard function (AES or MUL) <+
|
||||
* next address <-+
|
||||
*/
|
||||
/* Iteration 1 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
|
||||
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
|
||||
const uint8_t tmp = lsa[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
/* Iteration 2 */
|
||||
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
|
||||
|
||||
/* Iteration 3 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
|
||||
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
|
||||
const uint8_t tmp = lsa[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
/* Iteration 4 */
|
||||
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
|
||||
|
||||
}
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
__builtin_prefetch( ctx.text + 64, 0, 3 );
|
||||
__builtin_prefetch( ctx.long_state, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
|
||||
|
||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
||||
oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
|
||||
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
|
||||
|
||||
xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[1 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[2 * AES_BLOCK_SIZE], &ctx.long_state[i + 2 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[2 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[3 * AES_BLOCK_SIZE], &ctx.long_state[i + 3 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[3 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[4 * AES_BLOCK_SIZE], &ctx.long_state[i + 4 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[4 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[5 * AES_BLOCK_SIZE], &ctx.long_state[i + 5 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[5 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[6 * AES_BLOCK_SIZE], &ctx.long_state[i + 6 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[6 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[7 * AES_BLOCK_SIZE], &ctx.long_state[i + 7 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
}
|
||||
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
|
||||
// hash_permutation(&ctx.state.hs);
|
||||
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
||||
/*memcpy(hash, &state, 32);*/
|
||||
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
||||
oaes_free((OAES_CTX **) &ctx.aes_ctx);
|
||||
}
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
#ifndef __CRYPTONIGHT_H_INCLUDED
|
||||
#define __CRYPTONIGHT_H_INCLUDED
|
||||
|
||||
#include <stddef.h>
|
||||
#include "crypto/oaes_lib.h"
|
||||
#include "miner.h"
|
||||
|
||||
#define MEMORY (1 << 21) /* 2 MiB */
|
||||
#define MEMORY_M128I (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
|
||||
#define ITER (1 << 20)
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32 /*16*/
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
|
||||
#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8
|
||||
|
||||
|
||||
#pragma pack(push, 1)
|
||||
union hash_state {
|
||||
uint8_t b[200];
|
||||
uint64_t w[25];
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state {
|
||||
union hash_state hs;
|
||||
struct {
|
||||
uint8_t k[64];
|
||||
uint8_t init[INIT_SIZE_BYTE];
|
||||
};
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
void do_blake_hash(const void* input, size_t len, char* output);
|
||||
void do_groestl_hash(const void* input, size_t len, char* output);
|
||||
void do_jh_hash(const void* input, size_t len, char* output);
|
||||
void do_skein_hash(const void* input, size_t len, char* output);
|
||||
void cryptonight_hash_ctx(void* output, const void* input, int len);
|
||||
void keccakf(uint64_t st[25], int rounds);
|
||||
extern void (* const extra_hashes[4])(const void *, size_t, char *);
|
||||
|
||||
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
|
||||
|
||||
extern bool cryptonightV7;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
// 2x128
|
||||
|
||||
/*
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
@@ -25,7 +25,239 @@ static const uint64_t IV512[] =
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
||||
// If reinterleaving is necessary it may be more efficient to use
|
||||
// 2 way 256. The same transform code should work for both.
|
||||
|
||||
static void transform_4way( cube_4way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap128_64( x4 );
|
||||
x5 = mm512_swap128_64( x5 );
|
||||
x6 = mm512_swap128_64( x6 );
|
||||
x7 = mm512_swap128_64( x7 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( y0, 11 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( y1, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap64_32( x4 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
x6 = mm512_swap64_32( x6 );
|
||||
x7 = mm512_swap64_32( x7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
{
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// 2 way 128
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
@@ -59,10 +291,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_128( x4 );
|
||||
x5 = mm256_swap64_128( x5 );
|
||||
x6 = mm256_swap64_128( x6 );
|
||||
x7 = mm256_swap64_128( x7 );
|
||||
x4 = mm256_swap128_64( x4 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
@@ -77,10 +309,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap32_64( x4 );
|
||||
x5 = mm256_swap32_64( x5 );
|
||||
x6 = mm256_swap32_64( x6 );
|
||||
x7 = mm256_swap32_64( x7 );
|
||||
x4 = mm256_swap64_32( x4 );
|
||||
x5 = mm256_swap64_32( x5 );
|
||||
x6 = mm256_swap64_32( x6 );
|
||||
x7 = mm256_swap64_32( x7 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||
@@ -91,45 +323,35 @@ static void transform_2way( cube_2way_context *sp )
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m128i* h = (__m128i*)sp->h;
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
|
||||
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
|
||||
}
|
||||
else
|
||||
{
|
||||
h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
|
||||
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
|
||||
}
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -141,9 +363,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
@@ -164,11 +383,11 @@ int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) );
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
@@ -197,13 +416,63 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
|
||||
1,0,0,0 ) );
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
|
||||
@@ -1,11 +1,56 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define CUBE_HASH_2WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
struct _cube_4way_context
|
||||
{
|
||||
__m512i h[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
|
||||
#define cube512_4way_update cube_4way_update
|
||||
#define cube512_4way_update_close cube_4way_update
|
||||
#define cube512_4way_close cube_4way_update
|
||||
#define cube512_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 512, data, size )
|
||||
#define cube512_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 512, data, size )
|
||||
|
||||
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
|
||||
#define cube256_4way_update cube_4way_update
|
||||
#define cube256_4way_update_close cube_4way_update
|
||||
#define cube256_4way_close cube_4way_update
|
||||
#define cube256_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 256, data, size )
|
||||
#define cube256_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 256, data, size )
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
@@ -15,21 +60,18 @@ struct _cube_2way_context
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_2way_reinit( cube_2way_context *sp );
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -21,7 +21,27 @@ static void transform( cubehashParam *sp )
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
register __m512i x0, x1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->x );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->x + 1 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
|
||||
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32(
|
||||
mm512_swap256_128( x0 ), 11 ), x1 );
|
||||
x1 = mm512_swap64_32( x1 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->x, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->x + 1, x1 );
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
register __m256i x0, x1, x2, x3, y0, y1;
|
||||
|
||||
@@ -39,8 +59,8 @@ static void transform( cubehashParam *sp )
|
||||
x1 = mm256_rol_32( y0, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
x2 = mm256_swap64_128( x2 );
|
||||
x3 = mm256_swap64_128( x3 );
|
||||
x2 = mm256_swap128_64( x2 );
|
||||
x3 = mm256_swap128_64( x3 );
|
||||
x2 = _mm256_add_epi32( x0, x2 );
|
||||
x3 = _mm256_add_epi32( x1, x3 );
|
||||
y0 = mm256_swap_128( x0 );
|
||||
@@ -49,8 +69,8 @@ static void transform( cubehashParam *sp )
|
||||
x1 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
x2 = mm256_swap32_64( x2 );
|
||||
x3 = mm256_swap32_64( x3 );
|
||||
x2 = mm256_swap64_32( x2 );
|
||||
x3 = mm256_swap64_32( x3 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->x, x0 );
|
||||
@@ -210,11 +230,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 ) );
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -256,11 +275,89 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 ) );
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
||||
for ( i = 0; i < sp->hashlen; i++ )
|
||||
hash[i] = sp->x[i];
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
|
||||
const byte *data, size_t size )
|
||||
{
|
||||
__m128i *x = (__m128i*)sp->x;
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
const int len = size / 16;
|
||||
const __m128i* in = (__m128i*)data;
|
||||
__m128i* hash = (__m128i*)digest;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
||||
@@ -19,7 +19,7 @@ struct _cubehashParam
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
__m128i _ALIGN(256) x[8]; // aligned for __m256i
|
||||
__m128i _ALIGN(64) x[8]; // aligned for __m256i
|
||||
};
|
||||
|
||||
typedef struct _cubehashParam cubehashParam;
|
||||
@@ -39,6 +39,9 @@ int cubehashDigest(cubehashParam* sp, byte *digest);
|
||||
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
|
||||
size_t size );
|
||||
|
||||
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
|
||||
const byte *data, size_t size );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
* - implements NIST hash api
|
||||
* - assumes that message lenght is multiple of 8-bits
|
||||
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
|
||||
* - define NO_AES_NI for aes_ni version
|
||||
*
|
||||
* Cagdas Calik
|
||||
* ccalik@metu.edu.tr
|
||||
@@ -21,13 +20,7 @@
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
/*
|
||||
#ifndef NO_AES_NI
|
||||
#include <wmmintrin.h>
|
||||
#else
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
*/
|
||||
#include "simd-utils.h"
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
|
||||
@@ -179,53 +172,53 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
|
||||
k1 = ctx->k;
|
||||
k1 = ctx->k;
|
||||
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
@@ -390,13 +383,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -513,18 +506,177 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength datalen )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
state->k = m128_zero;
|
||||
state->processed_bits = 0;
|
||||
state->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
state->uHashSize = 256;
|
||||
state->uBlockLength = 192;
|
||||
state->uRounds = 8;
|
||||
state->hashsize = m128_const_64( 0, 0x100 );
|
||||
state->const1536 = m128_const_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->uHashSize = 512;
|
||||
state->uBlockLength = 128;
|
||||
state->uRounds = 10;
|
||||
state->hashsize = m128_const_64( 0, 0x200 );
|
||||
state->const1536 = m128_const_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return BAD_HASHBITLEN;
|
||||
}
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < nHashSize / 256; j++)
|
||||
state->state[i][j] = state->hashsize;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
state->state[i][j] = m128_zero;
|
||||
|
||||
|
||||
unsigned int uBlockCount, uRemainingBytes;
|
||||
|
||||
if( (state->uBufferBytes + datalen) >= state->uBlockLength )
|
||||
{
|
||||
if( state->uBufferBytes != 0 )
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy( state->buffer + state->uBufferBytes,
|
||||
(void*)data, state->uBlockLength - state->uBufferBytes );
|
||||
|
||||
// Process buffer
|
||||
Compress( state, state->buffer, 1 );
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
datalen -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = datalen / state->uBlockLength;
|
||||
uRemainingBytes = datalen % state->uBlockLength;
|
||||
|
||||
if( uBlockCount > 0 )
|
||||
{
|
||||
Compress( state, data, uBlockCount );
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if( uRemainingBytes > 0 )
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
|
||||
state->uBufferBytes += datalen;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
// Enough buffer space for padding in this block?
|
||||
if( (state->uBlockLength - state->uBufferBytes) >= 18 )
|
||||
{
|
||||
// Pad with zeros
|
||||
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
Compress( state, state->buffer, 1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
memset( state->buffer, 0, state->uBlockLength - 18 );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
|
||||
state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
#ifndef HASH_API_H
|
||||
#define HASH_API_H
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#ifdef __AES__
|
||||
#define HASH_IMPL_STR "ECHO-aesni"
|
||||
#else
|
||||
#define HASH_IMPL_STR "ECHO-vperm"
|
||||
@@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen );
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength databitlen );
|
||||
|
||||
#endif // HASH_API_H
|
||||
|
||||
|
||||
775
algo/echo/echo-hash-4way.c
Normal file
775
algo/echo/echo-hash-4way.c
Normal file
@@ -0,0 +1,775 @@
|
||||
#if defined(__VAES__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "echo-hash-4way.h"
|
||||
|
||||
/*
|
||||
static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57,
|
||||
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
|
||||
};
|
||||
*/
|
||||
// do these need to be reversed?
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
#define mul2mask \
|
||||
m512_const2_64( 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
#define lsbmask m512_const1_32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES( state, i, j ) \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
|
||||
k1 = _mm512_add_epi32( k1, m512_one_128 );
|
||||
|
||||
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
const int j1 = ( (j)+1 ) & 3; \
|
||||
const int j2 = ( (j)+2 ) & 3; \
|
||||
const int j3 = ( (j)+3 ) & 3; \
|
||||
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask );\
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
// blockcount always 1
|
||||
void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
|
||||
unsigned int uBlockCount )
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m512i t1, t2, s2, k1;
|
||||
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
|
||||
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
|
||||
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
|
||||
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
|
||||
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
|
||||
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
|
||||
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
|
||||
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
|
||||
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
|
||||
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
|
||||
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
|
||||
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
|
||||
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
|
||||
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
|
||||
|
||||
for ( b = 0; b < uBlockCount; b++ )
|
||||
{
|
||||
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
for( j = ctx->uHashSize / 256; j < 4; j++ )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ j ] = _mm512_load_si512(
|
||||
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE( _statebackup, _state );
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for ( r = 0; r < ctx->uRounds / 2; r++ )
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
if ( ctx->uHashSize == 256 )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 2 ] ) ;
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ] [0 ],
|
||||
_statebackup[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
}
|
||||
|
||||
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = m512_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen )
|
||||
{
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
|
||||
__m512i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_4way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
|
||||
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
|
||||
echo_4way_compress( state, state->buffer, 1 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen )
|
||||
{
|
||||
int i, j;
|
||||
int databitlen = datalen * 8;
|
||||
ctx->k = m512_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
|
||||
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = datalen / 32;
|
||||
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
|
||||
__m512i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_4way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m512_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] =
|
||||
m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
|
||||
|
||||
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
echo_4way_compress( ctx, ctx->buffer, 1 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
|
||||
|
||||
if ( ctx->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 )
|
||||
|
||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 );
|
||||
|
||||
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
const int j1 = ( (j)+1 ) & 3; \
|
||||
const int j2 = ( (j)+2 ) & 3; \
|
||||
const int j3 = ( (j)+3 ) & 3; \
|
||||
s2 = _mm256_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way );\
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm256_xor_si256( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm256_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm256_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
#define SAVESTATE_2WAY(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
// blockcount always 1
|
||||
void echo_2way_compress( echo_2way_context *ctx, const __m256i *pmsg,
|
||||
unsigned int uBlockCount )
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m256i t1, t2, s2, k1;
|
||||
__m256i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
|
||||
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
|
||||
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
|
||||
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
|
||||
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
|
||||
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
|
||||
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
|
||||
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
|
||||
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
|
||||
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
|
||||
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
|
||||
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
|
||||
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
|
||||
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
|
||||
|
||||
for ( b = 0; b < uBlockCount; b++ )
|
||||
{
|
||||
ctx->k = _mm256_add_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
for( j = ctx->uHashSize / 256; j < 4; j++ )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ j ] = _mm256_load_si256(
|
||||
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE_2WAY( _statebackup, _state );
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for ( r = 0; r < ctx->uRounds / 2; r++ )
|
||||
{
|
||||
ECHO_ROUND_UNROLL2_2WAY;
|
||||
}
|
||||
|
||||
if ( ctx->uHashSize == 256 )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 2 ] ) ;
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ] [0 ],
|
||||
_statebackup[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE_2WAY(ctx->state, _state);
|
||||
|
||||
}
|
||||
int echo_2way_init( echo_2way_context *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = m256_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_2way_update_close( echo_2way_context *state, void *hashval,
|
||||
const void *data, int databitlen )
|
||||
{
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
|
||||
__m256i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_2way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_256( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
|
||||
|
||||
state->k = _mm256_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm256_sub_epi64( state->k, state->const1536 );
|
||||
|
||||
echo_2way_compress( state, state->buffer, 1 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm256_store_si256( (__m256i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen )
|
||||
{
|
||||
int i, j;
|
||||
int databitlen = datalen * 8;
|
||||
ctx->k = m256_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m256_zero;
|
||||
|
||||
int vlen = datalen / 32;
|
||||
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
|
||||
__m256i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_2way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_256( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
|
||||
|
||||
ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
echo_2way_compress( ctx, ctx->buffer, 1 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hashval + 0, ctx->state[ 0 ][ 0] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 1, ctx->state[ 1 ][ 0] );
|
||||
|
||||
if ( ctx->uHashSize == 512 )
|
||||
{
|
||||
_mm256_store_si256( (__m256i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // VAES
|
||||
90
algo/echo/echo-hash-4way.h
Normal file
90
algo/echo/echo-hash-4way.h
Normal file
@@ -0,0 +1,90 @@
|
||||
#if !defined(ECHO_HASH_4WAY_H__)
|
||||
#define ECHO_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i state[4][4];
|
||||
__m512i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
|
||||
__m512i k;
|
||||
__m512i hashsize;
|
||||
__m512i const1536;
|
||||
|
||||
unsigned int uRounds;
|
||||
unsigned int uHashSize;
|
||||
unsigned int uBlockLength;
|
||||
unsigned int uBufferBytes;
|
||||
unsigned int processed_bits;
|
||||
|
||||
} echo_4way_context __attribute__ ((aligned (64)));
|
||||
#define echo512_4way_context echo_4way_context
|
||||
|
||||
int echo_4way_init( echo_4way_context *state, int hashbitlen );
|
||||
#define echo512_4way_init( state ) echo_4way_init( state, 512 )
|
||||
#define echo256_4way_init( state ) echo_4way_init( state, 256 )
|
||||
|
||||
int echo_4way_update( echo_4way_context *state, const void *data,
|
||||
unsigned int databitlen);
|
||||
#define echo512_4way_update echo_4way_update
|
||||
|
||||
// int echo_4way_close( echo_4way_context *state, void *hashval );
|
||||
// #define echo512_4way_close echo_4way_close
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
#define echo512_4way_update_close echo_4way_update_close
|
||||
|
||||
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen );
|
||||
#define echo512_4way_full( state, hashval, data, datalen ) \
|
||||
echo_4way_full( state, hashval, 512, data, datalen )
|
||||
#define echo256_4way_full( state, hashval, data, datalen ) \
|
||||
echo_4way_full( state, hashval, 256, data, datalen )
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i state[4][4];
|
||||
__m256i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
|
||||
__m256i k;
|
||||
__m256i hashsize;
|
||||
__m256i const1536;
|
||||
|
||||
unsigned int uRounds;
|
||||
unsigned int uHashSize;
|
||||
unsigned int uBlockLength;
|
||||
unsigned int uBufferBytes;
|
||||
unsigned int processed_bits;
|
||||
|
||||
} echo_2way_context __attribute__ ((aligned (64)));
|
||||
#define echo512_2way_context echo_2way_context
|
||||
|
||||
int echo_2way_init( echo_2way_context *state, int hashbitlen );
|
||||
#define echo512_2way_init( state ) echo_2way_init( state, 512 )
|
||||
#define echo256_2way_init( state ) echo_2way_init( state, 256 )
|
||||
|
||||
int echo_2way_update( echo_2way_context *state, const void *data,
|
||||
unsigned int databitlen);
|
||||
#define echo512_2way_update echo_2way_update
|
||||
|
||||
int echo_2way_update_close( echo_2way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
#define echo512_2way_update_close echo_2way_update_close
|
||||
|
||||
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen );
|
||||
#define echo512_2way_full( state, hashval, data, datalen ) \
|
||||
echo_2way_full( state, hashval, 512, data, datalen )
|
||||
#define echo256_2way_full( state, hashval, data, datalen ) \
|
||||
echo_2way_full( state, hashval, 256, data, datalen )
|
||||
|
||||
|
||||
#endif // VAES
|
||||
|
||||
#endif // ECHO_HASH_4WAY_H__
|
||||
@@ -36,6 +36,8 @@
|
||||
|
||||
#include "sph_echo.h"
|
||||
|
||||
#if !defined(__AES__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -1028,4 +1030,5 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif // !AES
|
||||
|
||||
@@ -36,6 +36,8 @@
|
||||
#ifndef SPH_ECHO_H__
|
||||
#define SPH_ECHO_H__
|
||||
|
||||
#if !defined(__AES__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -316,5 +318,5 @@ void sph_echo512_addbits_and_close(
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // !AES
|
||||
#endif
|
||||
|
||||
565
algo/fugue/fugue-aesni.c
Normal file
565
algo/fugue/fugue-aesni.c
Normal file
@@ -0,0 +1,565 @@
|
||||
/*
|
||||
* file : fugue_vperm.c
|
||||
* version : 1.0.208
|
||||
* date : 14.12.2010
|
||||
*
|
||||
* - vperm and aes_ni implementations of hash function Fugue
|
||||
* - implements NIST hash api
|
||||
* - assumes that message lenght is multiple of 8-bits
|
||||
* - _FUGUE_VPERM_ must be defined if compiling with ../main.c
|
||||
* - default version is vperm, define AES_NI for aes_ni version
|
||||
*
|
||||
* Cagdas Calik
|
||||
* ccalik@metu.edu.tr
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include <memory.h>
|
||||
#include "fugue-aesni.h"
|
||||
|
||||
|
||||
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
|
||||
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
|
||||
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
|
||||
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
|
||||
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
|
||||
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
|
||||
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
|
||||
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
|
||||
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
|
||||
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
|
||||
MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
|
||||
MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
|
||||
MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
|
||||
MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
|
||||
MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
|
||||
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
|
||||
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
|
||||
|
||||
|
||||
MYALIGN const unsigned int _IV512[] = {
|
||||
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
|
||||
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
|
||||
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
|
||||
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
|
||||
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
|
||||
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
#define PACK_S0(s0, s1, t1)\
|
||||
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
|
||||
|
||||
#define UNPACK_S0(s0, s1, t1)\
|
||||
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
|
||||
s0 = mm128_mask_32( s0, 8 )
|
||||
|
||||
#define CMIX(s1, s2, r1, r2, t1, t2)\
|
||||
t1 = s1;\
|
||||
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
|
||||
r1 = _mm_xor_si128(r1, t1);\
|
||||
r2 = _mm_xor_si128(r2, t1);
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define PACK_S0(s0, s1, t1)\
|
||||
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
|
||||
s0 = _mm_xor_si128(s0, t1);
|
||||
|
||||
#define UNPACK_S0(s0, s1, t1)\
|
||||
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
|
||||
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
|
||||
s0 = mm128_mask_32( s0, 8 )
|
||||
|
||||
#define CMIX(s1, s2, r1, r2, t1, t2)\
|
||||
t1 = _mm_shuffle_epi32(s1, 0xf9);\
|
||||
t2 = _mm_shuffle_epi32(s2, 0xcf);\
|
||||
t1 = _mm_xor_si128(t1, t2);\
|
||||
r1 = _mm_xor_si128(r1, t1);\
|
||||
r2 = _mm_xor_si128(r2, t1)
|
||||
|
||||
#endif
|
||||
|
||||
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
|
||||
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s10 = _mm_xor_si128(s10, t1);\
|
||||
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
|
||||
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
|
||||
t1 = _mm_slli_si128(t1, 8);\
|
||||
s8 = _mm_xor_si128(s8, t1);\
|
||||
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s0 = _mm_xor_si128(s0, t1)
|
||||
|
||||
|
||||
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
|
||||
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s16 = _mm_xor_si128(s16, t1);\
|
||||
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
|
||||
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
|
||||
t1 = _mm_slli_si128(t1, 8);\
|
||||
s8 = _mm_xor_si128(s8, t1);\
|
||||
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s0 = _mm_xor_si128(s0, t1);\
|
||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s4 = _mm_xor_si128(s4, t1)
|
||||
|
||||
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
|
||||
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s22 = _mm_xor_si128(s22, t1);\
|
||||
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
|
||||
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
|
||||
t1 = _mm_slli_si128(t1, 8);\
|
||||
s8 = _mm_xor_si128(s8, t1);\
|
||||
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s0 = _mm_xor_si128(s0, t1);\
|
||||
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s4 = _mm_xor_si128(s4, t1);\
|
||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s7 = _mm_xor_si128(s7, t1)
|
||||
|
||||
|
||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||
s1 = x;\
|
||||
s2 = _mm_add_epi8(x, x);\
|
||||
t2 = _mm_add_epi8(s2, s2);\
|
||||
t1 = _mm_srli_epi16(x, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
|
||||
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
||||
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
||||
|
||||
|
||||
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
|
||||
t4 = t1;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t4;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = _mm_xor_si128(t2, t3);\
|
||||
t2 = _mm_xor_si128(t2, t0);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = t0;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = _mm_xor_si128(t4, t0);\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
||||
t4 = _mm_xor_si128(t4, t0)
|
||||
|
||||
|
||||
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
UNPACK_S0(r3c, r3a, _t3)
|
||||
|
||||
|
||||
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
||||
r4c = _mm_xor_si128(r4c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r4d = _mm_xor_si128(r4d, _t0);\
|
||||
UNPACK_S0(r3c, r3a, _t3);\
|
||||
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
||||
UNPACK_S0(r4c, r4a, _t3)
|
||||
|
||||
|
||||
|
||||
#define LOADCOLUMN(x, s, a)\
|
||||
block[0] = col[(base + a + 0) % s];\
|
||||
block[1] = col[(base + a + 1) % s];\
|
||||
block[2] = col[(base + a + 2) % s];\
|
||||
block[3] = col[(base + a + 3) % s];\
|
||||
x = _mm_load_si128((__m128i*)block)
|
||||
|
||||
#define STORECOLUMN(x, s)\
|
||||
_mm_store_si128((__m128i*)block, x);\
|
||||
col[(base + 0) % s] = block[0];\
|
||||
col[(base + 1) % s] = block[1];\
|
||||
col[(base + 2) % s] = block[2];\
|
||||
col[(base + 3) % s] = block[3]
|
||||
|
||||
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
|
||||
{
|
||||
__m128i _t0, _t1, _t2, _t3;
|
||||
|
||||
switch(ctx->base)
|
||||
{
|
||||
case 1:
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
|
||||
ctx->state[5], ctx->state[ 6], ctx->state[8],
|
||||
ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
|
||||
ctx->state[1], ctx->state[7], ctx->state[8],
|
||||
ctx->state[6], ctx->state[0], ctx->state[6],
|
||||
ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6], ctx->state[4],
|
||||
ctx->state[10] );
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
case 2:
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
|
||||
ctx->state[ 1], ctx->state[2], ctx->state[4],
|
||||
ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
|
||||
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
|
||||
ctx->state[9], ctx->state[3], ctx->state[4],
|
||||
ctx->state[2], ctx->state[8], ctx->state[2],
|
||||
ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0],
|
||||
ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
while( uBlockCount > 0 )
|
||||
{
|
||||
TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
|
||||
ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
|
||||
ctx->state[5], ctx->state[11], ctx->state[0],
|
||||
ctx->state[10], ctx->state[4], ctx->state[10],
|
||||
ctx->state[11], ctx->state[9], ctx->state[3],
|
||||
ctx->state[9], ctx->state[10], ctx->state[8],
|
||||
ctx->state[2] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
|
||||
ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
|
||||
ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]);
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
|
||||
ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
|
||||
_t0, _t1, _t2);
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
|
||||
ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
|
||||
ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
{
|
||||
unsigned int block[4] __attribute__ ((aligned (32)));
|
||||
unsigned int col[36] __attribute__ ((aligned (16)));
|
||||
unsigned int i, base;
|
||||
__m128i r0, _t0, _t1, _t2, _t3;
|
||||
|
||||
for(i = 0; i < 12; i++)
|
||||
{
|
||||
_mm_store_si128((__m128i*)block, ctx->state[i]);
|
||||
|
||||
col[3 * i + 0] = block[0];
|
||||
col[3 * i + 1] = block[1];
|
||||
col[3 * i + 2] = block[2];
|
||||
}
|
||||
|
||||
base = (36 - (12 * ctx->base)) % 36;
|
||||
|
||||
for(i = 0; i < 32; i++)
|
||||
{
|
||||
// ROR3
|
||||
base = (base + 33) % 36;
|
||||
|
||||
// CMIX
|
||||
col[(base + 0) % 36] ^= col[(base + 4) % 36];
|
||||
col[(base + 1) % 36] ^= col[(base + 5) % 36];
|
||||
col[(base + 2) % 36] ^= col[(base + 6) % 36];
|
||||
col[(base + 18) % 36] ^= col[(base + 4) % 36];
|
||||
col[(base + 19) % 36] ^= col[(base + 5) % 36];
|
||||
col[(base + 20) % 36] ^= col[(base + 6) % 36];
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
|
||||
for(i = 0; i < 13; i++)
|
||||
{
|
||||
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 9) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 18) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
||||
|
||||
// ROR9
|
||||
base = (base + 27) % 36;
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
|
||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 18) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
||||
|
||||
// ROR9
|
||||
base = (base + 27) % 36;
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
|
||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
||||
|
||||
// ROR9
|
||||
base = (base + 27) % 36;
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
|
||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 28) % 36] ^= col[(base + 0) % 36];
|
||||
|
||||
// ROR8
|
||||
base = (base + 28) % 36;
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
|
||||
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 9) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 18) % 36] ^= col[(base + 0) % 36];
|
||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
||||
|
||||
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
|
||||
LOADCOLUMN(r0, 36, 1);
|
||||
_mm_store_si128((__m128i*)hashval, r0);
|
||||
|
||||
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
|
||||
LOADCOLUMN(r0, 36, 9);
|
||||
_mm_store_si128((__m128i*)hashval + 1, r0);
|
||||
|
||||
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
|
||||
LOADCOLUMN(r0, 36, 18);
|
||||
_mm_store_si128((__m128i*)hashval + 2, r0);
|
||||
|
||||
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
|
||||
LOADCOLUMN(r0, 36, 27);
|
||||
_mm_store_si128((__m128i*)hashval + 3, r0);
|
||||
}
|
||||
|
||||
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
|
||||
{
|
||||
int i;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
ctx->base = 0;
|
||||
|
||||
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 4;
|
||||
|
||||
for(i = 0; i < 6; i++)
|
||||
ctx->state[i] = m128_zero;
|
||||
|
||||
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
|
||||
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
|
||||
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
|
||||
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
|
||||
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
|
||||
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
uByteLength = (unsigned int)(databitlen / 8);
|
||||
|
||||
if(state->uBufferBytes + uByteLength >= state->uBlockLength)
|
||||
{
|
||||
if(state->uBufferBytes != 0)
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
|
||||
|
||||
// Process the buffer
|
||||
Compress512(state, state->buffer, 1);
|
||||
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
uByteLength -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = uByteLength / state->uBlockLength;
|
||||
uRemainingBytes = uByteLength % state->uBlockLength;
|
||||
|
||||
if(uBlockCount > 0)
|
||||
{
|
||||
Compress512(state, data, uBlockCount);
|
||||
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if(uRemainingBytes > 0)
|
||||
{
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
}
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
|
||||
state->uBufferBytes += uByteLength;
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
|
||||
{
|
||||
unsigned int i;
|
||||
BitSequence lengthbuf[8] __attribute__((aligned(64)));
|
||||
|
||||
// Update message bit count
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
// Pad the remaining buffer bytes with zero
|
||||
if(state->uBufferBytes != 0)
|
||||
{
|
||||
if ( state->uBufferBytes != state->uBlockLength)
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
|
||||
|
||||
Compress512(state, state->buffer, 1);
|
||||
}
|
||||
|
||||
// Last two blocks are message length in bits
|
||||
for(i = 0; i < 8; i++)
|
||||
lengthbuf[i] = ((state->processed_bits) >> (8 * (7 - i))) & 0xff;
|
||||
|
||||
// Process the last two blocks
|
||||
Compress512(state, lengthbuf, 2);
|
||||
|
||||
// Finalization
|
||||
Final512(state, hashval);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
|
||||
{
|
||||
fugue512_Init(hs, 512);
|
||||
fugue512_Update(hs, data, databitlen*8);
|
||||
fugue512_Final(hs, hashval);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
#endif // AES
|
||||
46
algo/fugue/fugue-aesni.h
Normal file
46
algo/fugue/fugue-aesni.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* file : hash_api.h
|
||||
* version : 1.0.208
|
||||
* date : 14.12.2010
|
||||
*
|
||||
* Fugue vperm implementation Hash API
|
||||
*
|
||||
* Cagdas Calik
|
||||
* ccalik@metu.edu.tr
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef FUGUE_HASH_API_H
|
||||
#define FUGUE_HASH_API_H
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m128i state[12];
|
||||
unsigned int base;
|
||||
|
||||
unsigned int uHashSize;
|
||||
unsigned int uBlockLength;
|
||||
unsigned int uBufferBytes;
|
||||
DataLength processed_bits;
|
||||
BitSequence buffer[4];
|
||||
|
||||
} hashState_fugue __attribute__ ((aligned (64)));
|
||||
|
||||
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
|
||||
|
||||
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
|
||||
|
||||
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
|
||||
|
||||
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
|
||||
|
||||
#endif // AES
|
||||
#endif // HASH_API_H
|
||||
|
||||
@@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst);
|
||||
void sph_fugue512_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#define sph_fugue512_full( cc, dst, data, len ) \
|
||||
do{ \
|
||||
sph_fugue512_init( cc ); \
|
||||
sph_fugue512( cc, data, len ); \
|
||||
sph_fugue512_close( cc, dst ); \
|
||||
}while(0)
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "sph_gost.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -696,9 +696,26 @@ static void AddModulo512(const void *a,const void *b,void *c)
|
||||
|
||||
static void AddXor512(const void *a,const void *b,void *c)
|
||||
{
|
||||
const unsigned long long *A=a, *B=b;
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
|
||||
casti_m512i( b, 0 ) );
|
||||
#elif defined(__AVX2__)
|
||||
casti_m256i( c, 0 ) = _mm256_xor_si256( casti_m256i( a, 0 ),
|
||||
casti_m256i( b, 0 ) );
|
||||
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
|
||||
casti_m256i( b, 1 ) );
|
||||
#elif defined(__SSE2__)
|
||||
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
|
||||
casti_m128i( b, 0 ) );
|
||||
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
|
||||
casti_m128i( b, 1 ) );
|
||||
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
|
||||
casti_m128i( b, 2 ) );
|
||||
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
|
||||
casti_m128i( b, 3 ) );
|
||||
#else
|
||||
const unsigned long long *A=a, *B=b;
|
||||
unsigned long long *C=c;
|
||||
#ifdef FULL_UNROLL
|
||||
C[0] = A[0] ^ B[0];
|
||||
C[1] = A[1] ^ B[1];
|
||||
C[2] = A[2] ^ B[2];
|
||||
@@ -707,12 +724,6 @@ static void AddXor512(const void *a,const void *b,void *c)
|
||||
C[5] = A[5] ^ B[5];
|
||||
C[6] = A[6] ^ B[6];
|
||||
C[7] = A[7] ^ B[7];
|
||||
#else
|
||||
int i = 0;
|
||||
|
||||
for(i=0; i<8; i++) {
|
||||
C[i] = A[i] ^ B[i];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -893,31 +904,32 @@ static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m)
|
||||
|
||||
static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out)
|
||||
{
|
||||
unsigned char v512[64] = {
|
||||
unsigned char v512[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00
|
||||
};
|
||||
unsigned char v0[64] = {
|
||||
};
|
||||
unsigned char v0[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char Sigma[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char Sigma[64] = {
|
||||
unsigned char N[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char N[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char m[64], *hash = IV;
|
||||
unsigned char m[64] __attribute__((aligned(64)));
|
||||
unsigned char *hash = IV;
|
||||
unsigned long long len = length;
|
||||
|
||||
// Stage 2
|
||||
@@ -952,7 +964,7 @@ static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long
|
||||
|
||||
static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out)
|
||||
{
|
||||
unsigned char IV[64] = {
|
||||
unsigned char IV[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
@@ -81,9 +81,9 @@ typedef struct {
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
unsigned char buf[64] __attribute__((aligned(64)));
|
||||
sph_u32 V[5][8] __attribute__((aligned(64)));
|
||||
size_t ptr;
|
||||
sph_u32 V[5][8];
|
||||
#endif
|
||||
} sph_gost512_context;
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,6 @@
|
||||
#if !defined GROESTL_INTR_AES_H__
|
||||
#define GROESTL_INTR_AES_H__
|
||||
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
@@ -11,16 +14,51 @@
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
//__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
//__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
|
||||
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
|
||||
{ 0x7262524232221202, 0xf2e2d2c2b2a29282 },
|
||||
{ 0x7363534333231303, 0xf3e3d3c3b3a39383 },
|
||||
{ 0x7464544434241404, 0xf4e4d4c4b4a49484 },
|
||||
{ 0x7565554535251505, 0xf5e5d5c5b5a59585 },
|
||||
{ 0x7666564636261606, 0xf6e6d6c6b6a69686 },
|
||||
{ 0x7767574737271707, 0xf7e7d7c7b7a79787 },
|
||||
{ 0x7868584838281808, 0xf8e8d8c8b8a89888 },
|
||||
{ 0x7969594939291909, 0xf9e9d9c9b9a99989 },
|
||||
{ 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
|
||||
{ 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
|
||||
{ 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
|
||||
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
|
||||
};
|
||||
|
||||
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
|
||||
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
|
||||
{ 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
|
||||
{ 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
|
||||
{ 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
|
||||
{ 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
|
||||
{ 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
|
||||
{ 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
|
||||
{ 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
|
||||
{ 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
|
||||
{ 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
|
||||
{ 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
|
||||
{ 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -73,7 +111,7 @@ __m128i ALL_FF;
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
@@ -111,7 +149,7 @@ __m128i ALL_FF;
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -152,25 +190,6 @@ __m128i ALL_FF;
|
||||
}/*MixBytes*/
|
||||
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
|
||||
for(i = 0; i < ROUNDS1024; i++)\
|
||||
{\
|
||||
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
|
||||
}\
|
||||
}while(0);\
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
@@ -194,32 +213,34 @@ __m128i ALL_FF;
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
|
||||
xmm8 = _mm_xor_si128( xmm8, \
|
||||
casti_m128i( round_const_p, round_counter ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
xmm0 = _mm_xor_si128( xmm0, \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
}\
|
||||
}
|
||||
|
||||
@@ -227,48 +248,52 @@ __m128i ALL_FF;
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm1);\
|
||||
xmm9 = _mm_xor_si128(xmm9, xmm1);\
|
||||
xmm10 = _mm_xor_si128(xmm10, xmm1);\
|
||||
xmm11 = _mm_xor_si128(xmm11, xmm1);\
|
||||
xmm12 = _mm_xor_si128(xmm12, xmm1);\
|
||||
xmm13 = _mm_xor_si128(xmm13, xmm1);\
|
||||
xmm14 = _mm_xor_si128(xmm14, xmm1);\
|
||||
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
|
||||
xmm1 = m128_neg1;\
|
||||
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
|
||||
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
|
||||
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
|
||||
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
|
||||
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
|
||||
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
|
||||
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
|
||||
xmm15 = _mm_xor_si128( xmm15, \
|
||||
casti_m128i( round_const_q, round_counter ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm9);\
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm9);\
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm9);\
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm9);\
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm9);\
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm9);\
|
||||
xmm6 = _mm_xor_si128(xmm6, xmm9);\
|
||||
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
|
||||
xmm9 = m128_neg1;\
|
||||
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
|
||||
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
|
||||
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
|
||||
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
|
||||
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
|
||||
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
|
||||
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
|
||||
xmm7 = _mm_xor_si128( xmm7, \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
}\
|
||||
}
|
||||
|
||||
@@ -280,7 +305,7 @@ __m128i ALL_FF;
|
||||
* clobbers: t0-t7
|
||||
*/
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK;\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i6 = _mm_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
@@ -368,7 +393,7 @@ __m128i ALL_FF;
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o0 = TRANSP_MASK;\
|
||||
o0 = TRANSP_MASK; \
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm_unpackhi_epi64(t2, i7);\
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
@@ -609,3 +634,4 @@ void OF1024( __m128i* chaining )
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,16 +0,0 @@
|
||||
// specify assembly or intrinsics implementation
|
||||
//#define TASM
|
||||
#define TINTR
|
||||
|
||||
//#define AES_NI
|
||||
|
||||
//#ifdef AES_NI
|
||||
// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
|
||||
|
||||
//#ifndef NO_AES_NI
|
||||
|
||||
#define VAES
|
||||
// #define VAVX
|
||||
// #define VVPERM
|
||||
|
||||
//#endif
|
||||
@@ -1,529 +0,0 @@
|
||||
/* groestl-asm-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
|
||||
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
|
||||
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
|
||||
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
}/*MixBytes*/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,519 +0,0 @@
|
||||
/* groestl-asm-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global variables */
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2v2(i, j, k, z){\
|
||||
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
|
||||
\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
|
||||
\
|
||||
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("vmovaps xmm12, [rdi+0*16]");
|
||||
asm ("vmovaps xmm13, [rdi+1*16]");
|
||||
asm ("vmovaps xmm14, [rdi+2*16]");
|
||||
asm ("vmovaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("vmovaps [rdi+0*16], xmm12");
|
||||
asm ("vmovaps [rdi+1*16], xmm2");
|
||||
asm ("vmovaps [rdi+2*16], xmm6");
|
||||
asm ("vmovaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("vmovaps xmm12, [rsi+0*16]");
|
||||
asm ("vmovaps xmm13, [rsi+1*16]");
|
||||
asm ("vmovaps xmm14, [rsi+2*16]");
|
||||
asm ("vmovaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value and xor message to CV to get input of P */
|
||||
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
|
||||
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
|
||||
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
|
||||
asm ("vpxor xmm5, xmm7, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("vpxor xmm0, xmm0, xmm8");
|
||||
asm ("vpxor xmm1, xmm1, xmm10");
|
||||
asm ("vpxor xmm2, xmm2, xmm12");
|
||||
asm ("vpxor xmm3, xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
|
||||
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
|
||||
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
|
||||
asm ("vpxor xmm3, xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("vmovaps [rdi+0*16], xmm0");
|
||||
asm ("vmovaps [rdi+1*16], xmm1");
|
||||
asm ("vmovaps [rdi+2*16], xmm2");
|
||||
asm ("vmovaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("vmovaps xmm8, [rdi+0*16]");
|
||||
asm ("vmovaps xmm10, [rdi+1*16]");
|
||||
asm ("vmovaps xmm12, [rdi+2*16]");
|
||||
asm ("vmovaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
|
||||
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
|
||||
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
|
||||
asm ("vpxor xmm14, xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("vmovaps [rdi+2*16], xmm9");
|
||||
asm ("vmovaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,856 +0,0 @@
|
||||
/* groestl-asm-vperm.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3 instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* Based on the vperm and aes_ni implementations of the hash function Groestl
|
||||
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
|
||||
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
|
||||
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
|
||||
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
|
||||
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
|
||||
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
|
||||
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
|
||||
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
|
||||
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
|
||||
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
|
||||
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
|
||||
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
|
||||
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
|
||||
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
|
||||
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
|
||||
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
|
||||
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
|
||||
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
|
||||
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
|
||||
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
|
||||
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
|
||||
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
|
||||
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
|
||||
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
|
||||
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
|
||||
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
|
||||
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
|
||||
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
|
||||
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
|
||||
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform w/o settings c*
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
|
||||
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
|
||||
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
|
||||
asm ("psrld xmm"tostr(t0)", 4");\
|
||||
asm ("psrld xmm"tostr(t1)", 4");\
|
||||
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
|
||||
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
|
||||
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
|
||||
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
|
||||
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
|
||||
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
|
||||
}/**/
|
||||
|
||||
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
|
||||
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform State
|
||||
* inputs:
|
||||
* a0-a3 = state
|
||||
* table = transformation table to use
|
||||
* t* = clobbers
|
||||
* outputs:
|
||||
* a0-a3 = transformed state
|
||||
* */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Add Constant to State
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* constant = constant to add
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* a0-a7 = state + constant
|
||||
* */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Set Substitute Core Constants
|
||||
* */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Substitute Core
|
||||
* first part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0 = 1 row
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* b0a, b0b = inputs for lookup step
|
||||
* */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
|
||||
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
|
||||
asm ("psrld xmm"tostr(t0)", 4");\
|
||||
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
|
||||
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Lookup
|
||||
* second part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0a, a0b = output of Substitution Core
|
||||
* table = lookup table to use (*1 / *2 / *4)
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* b0 = output of sbox + multiplication
|
||||
* */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
|
||||
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
|
||||
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* SubBytes and *2 / *4
|
||||
* this function is derived from:
|
||||
* Constant-time SSSE3 AES core implementation
|
||||
* by Mike Hamburg
|
||||
* and
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0-a7 = state * 4
|
||||
* c2 = row0 * 2 -> b0
|
||||
* c1 = row7 * 2 -> b3
|
||||
* c0 = row7 * 1 -> b4
|
||||
* t2 = row4 * 1 -> b7
|
||||
* TEMP_MUL1 = row(i) * 1
|
||||
* TEMP_MUL2 = row(i) * 2
|
||||
*
|
||||
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
|
||||
* inputs:
|
||||
* a0-a7 = (row0-row7) * 4
|
||||
* b0 = row0 * 2
|
||||
* b3 = row7 * 2
|
||||
* b4 = row7 * 1
|
||||
* b7 = row4 * 1
|
||||
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
|
||||
* output: b0-b7
|
||||
* */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
|
||||
/* 1 */\
|
||||
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
|
||||
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
|
||||
\
|
||||
/* 2 */\
|
||||
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
|
||||
\
|
||||
/* 4 */\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
|
||||
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* 3 */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
|
||||
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
|
||||
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
|
||||
\
|
||||
/* 6 */\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 7 */\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 8 */\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 9 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 10 */\
|
||||
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 11 */\
|
||||
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 12 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 13 */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/**/
|
||||
|
||||
//#if (LENGTH <= 256)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}/**/
|
||||
|
||||
#define Push_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}/**/
|
||||
|
||||
#define Pop_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}/**/
|
||||
|
||||
|
||||
/* vperm:
|
||||
* transformation before rounds with ipt
|
||||
* first round add transformed constant
|
||||
* middle rounds: add constant XOR 0x15...15
|
||||
* last round: additionally add 0x15...15 after MB
|
||||
* transformation after rounds with opt
|
||||
*/
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
|
||||
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
|
||||
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("pxor xmm1, [ALL_15]");\
|
||||
asm ("pxor xmm2, [ALL_15]");\
|
||||
asm ("pxor xmm3, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
|
||||
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_Lx], xmm0");\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,17 +11,44 @@
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
//__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
//__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xffffffffffffffff },
|
||||
{ 0x7161514131211101, 0xffffffffffffffff },
|
||||
{ 0x7262524232221202, 0xffffffffffffffff },
|
||||
{ 0x7363534333231303, 0xffffffffffffffff },
|
||||
{ 0x7464544434241404, 0xffffffffffffffff },
|
||||
{ 0x7565554535251505, 0xffffffffffffffff },
|
||||
{ 0x7666564636261606, 0xffffffffffffffff },
|
||||
{ 0x7767574737271707, 0xffffffffffffffff },
|
||||
{ 0x7868584838281808, 0xffffffffffffffff },
|
||||
{ 0x7969594939291909, 0xffffffffffffffff }
|
||||
};
|
||||
|
||||
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
|
||||
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
|
||||
{ 0x0000000000000000, 0x8d9dadbdcdddedfd },
|
||||
{ 0x0000000000000000, 0x8c9cacbcccdcecfc },
|
||||
{ 0x0000000000000000, 0x8b9babbbcbdbebfb },
|
||||
{ 0x0000000000000000, 0x8a9aaabacadaeafa },
|
||||
{ 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
|
||||
{ 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
|
||||
{ 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
|
||||
static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -38,8 +65,6 @@ __m128i ALL_FF;
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
@@ -113,7 +138,7 @@ __m128i ALL_FF;
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -153,25 +178,6 @@ __m128i ALL_FF;
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0); \
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
@@ -179,34 +185,34 @@ __m128i ALL_FF;
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
|
||||
a1 = _mm_xor_si128( a1, b1 ); \
|
||||
a2 = _mm_xor_si128( a2, b1 ); \
|
||||
a3 = _mm_xor_si128( a3, b1 ); \
|
||||
a4 = _mm_xor_si128( a4, b1 ); \
|
||||
a5 = _mm_xor_si128( a5, b1 ); \
|
||||
a6 = _mm_xor_si128( a6, b1 ); \
|
||||
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
|
||||
a0 = _mm_aesenclast_si128( a0, b0 );\
|
||||
a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
|
||||
a1 = _mm_aesenclast_si128( a1, b0 );\
|
||||
a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
|
||||
a2 = _mm_aesenclast_si128( a2, b0 );\
|
||||
a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
|
||||
a3 = _mm_aesenclast_si128( a3, b0 );\
|
||||
a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
|
||||
a4 = _mm_aesenclast_si128( a4, b0 );\
|
||||
a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
|
||||
a5 = _mm_aesenclast_si128( a5, b0 );\
|
||||
a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
|
||||
a6 = _mm_aesenclast_si128( a6, b0 );\
|
||||
a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
|
||||
a7 = _mm_aesenclast_si128( a7, b0 );\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
@@ -234,8 +240,9 @@ __m128i ALL_FF;
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
|
||||
@@ -1,482 +0,0 @@
|
||||
/* groestl-intr-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_FF;
|
||||
//#if LENGTH <= 256
|
||||
__m128i ALL_1B;
|
||||
//#else
|
||||
//__m256d ALL_1B;
|
||||
//#endif
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
|
||||
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
j = _mm_cmpgt_epi8(z, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
|
||||
b0 = a2;\
|
||||
b1 = a3;\
|
||||
b2 = a4;\
|
||||
b3 = a5;\
|
||||
b4 = a6;\
|
||||
b5 = a7;\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
TEMP1 = b1;\
|
||||
TEMP2 = b2;\
|
||||
\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b1 = a1;\
|
||||
TEMP3 = a2;\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP3);\
|
||||
\
|
||||
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
b1 = ALL_1B;\
|
||||
b2 = _mm_xor_si128(b2, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a2 = _mm_xor_si128(a2, TEMP2);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
b0 = _mm_xor_si128(a3, TEMP0);\
|
||||
b1 = _mm_xor_si128(a4, TEMP1);\
|
||||
b2 = _mm_xor_si128(a5, TEMP2);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* Add Round Constant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = _mm_unpackhi_epi16(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
t0 = _mm_unpackhi_epi16(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = _mm_unpackhi_epi32(i0, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o1, t0);\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = _mm_unpackhi_epi64(i0, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o2 = _mm_unpacklo_epi64(i1, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i1, i5);\
|
||||
o4 = _mm_unpacklo_epi64(i2, i6);\
|
||||
o5 = _mm_unpackhi_epi64(i2, i6);\
|
||||
o6 = _mm_unpacklo_epi64(i3, i7);\
|
||||
o7 = _mm_unpackhi_epi64(i3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = _mm_unpackhi_epi64(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o2 = _mm_unpackhi_epi64(i4, i5);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i6, i7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i0, t0);\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i2, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i4, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i6, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value and xor message to CV to get input of P */
|
||||
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
|
||||
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
|
||||
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
|
||||
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
|
||||
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
|
||||
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
|
||||
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,793 +0,0 @@
|
||||
/* groestl-intr-vperm.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3 instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* Based on the vperm and aes_ni implementations of the hash function Groestl
|
||||
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <tmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_0F;
|
||||
__m128i ALL_15;
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_63;
|
||||
__m128i ALL_FF;
|
||||
__m128i VPERM_IPT[2];
|
||||
__m128i VPERM_OPT[2];
|
||||
__m128i VPERM_INV[2];
|
||||
__m128i VPERM_SB1[2];
|
||||
__m128i VPERM_SB2[2];
|
||||
__m128i VPERM_SB4[2];
|
||||
__m128i VPERM_SBO[2];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
|
||||
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
|
||||
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
|
||||
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
|
||||
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
|
||||
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
|
||||
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
|
||||
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
|
||||
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
|
||||
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
|
||||
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
|
||||
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
|
||||
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
|
||||
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
|
||||
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform w/o settings c*
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t1 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t1 = _mm_andnot_si128(t1, a1);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
t1 = _mm_srli_epi32(t1, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
a1 = _mm_and_si128(a1, c0);\
|
||||
t2 = c2;\
|
||||
t3 = c2;\
|
||||
t2 = _mm_shuffle_epi8(t2, a0);\
|
||||
t3 = _mm_shuffle_epi8(t3, a1);\
|
||||
a0 = c1;\
|
||||
a1 = c1;\
|
||||
a0 = _mm_shuffle_epi8(a0, t0);\
|
||||
a1 = _mm_shuffle_epi8(a1, t1);\
|
||||
a0 = _mm_xor_si128(a0, t2);\
|
||||
a1 = _mm_xor_si128(a1, t3);\
|
||||
}/**/
|
||||
|
||||
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
c0 = ALL_0F;\
|
||||
c1 = ((__m128i*) table )[0];\
|
||||
c2 = ((__m128i*) table )[1];\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform State
|
||||
* inputs:
|
||||
* a0-a3 = state
|
||||
* table = transformation table to use
|
||||
* t* = clobbers
|
||||
* outputs:
|
||||
* a0-a3 = transformed state
|
||||
* */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Add Constant to State
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* constant = constant to add
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* a0-a7 = state + constant
|
||||
* */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
t0 = constant;\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
a1 = _mm_xor_si128(a1, t0);\
|
||||
a2 = _mm_xor_si128(a2, t0);\
|
||||
a3 = _mm_xor_si128(a3, t0);\
|
||||
a4 = _mm_xor_si128(a4, t0);\
|
||||
a5 = _mm_xor_si128(a5, t0);\
|
||||
a6 = _mm_xor_si128(a6, t0);\
|
||||
a7 = _mm_xor_si128(a7, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Set Substitute Core Constants
|
||||
* */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Substitute Core
|
||||
* first part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0 = 1 row
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* b0a, b0b = inputs for lookup step
|
||||
* */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
b0a = c1;\
|
||||
b0a = _mm_shuffle_epi8(b0a, a0);\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t0);\
|
||||
b0b = _mm_xor_si128(b0b, b0a);\
|
||||
t1 = c2;\
|
||||
t1 = _mm_shuffle_epi8(t1, a0);\
|
||||
t1 = _mm_xor_si128(t1, b0a);\
|
||||
b0a = c2;\
|
||||
b0a = _mm_shuffle_epi8(b0a, b0b);\
|
||||
b0a = _mm_xor_si128(b0a, a0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t1);\
|
||||
b0b = _mm_xor_si128(b0b, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Lookup
|
||||
* second part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0a, a0b = output of Substitution Core
|
||||
* table = lookup table to use (*1 / *2 / *4)
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* b0 = output of sbox + multiplication
|
||||
* */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
b0 = ((__m128i*) table )[0];\
|
||||
t0 = ((__m128i*) table )[1];\
|
||||
b0 = _mm_shuffle_epi8(b0, a0b);\
|
||||
t0 = _mm_shuffle_epi8(t0, a0a);\
|
||||
b0 = _mm_xor_si128(b0, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* SubBytes and *2 / *4
|
||||
* this function is derived from:
|
||||
* Constant-time SSSE3 AES core implementation
|
||||
* by Mike Hamburg
|
||||
* and
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0-a7 = state * 4
|
||||
* c2 = row0 * 2 -> b0
|
||||
* c1 = row7 * 2 -> b3
|
||||
* c0 = row7 * 1 -> b4
|
||||
* t2 = row4 * 1 -> b7
|
||||
* TEMP_MUL1 = row(i) * 1
|
||||
* TEMP_MUL2 = row(i) * 2
|
||||
*
|
||||
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[1] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[1] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[2] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[2] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[3] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[3] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[5] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[5] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[6] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[6] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[7] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[4] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
TEMP_MUL2[0] = c2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
|
||||
* inputs:
|
||||
* a0-a7 = (row0-row7) * 4
|
||||
* b0 = row0 * 2
|
||||
* b3 = row7 * 2
|
||||
* b4 = row7 * 1
|
||||
* b7 = row4 * 1
|
||||
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
|
||||
* output: b0-b7
|
||||
* */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
TEMP_MUL4 = a3;\
|
||||
/* 1 */\
|
||||
b1 = a0;\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
|
||||
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
|
||||
b2 = b1;\
|
||||
\
|
||||
/* 2 */\
|
||||
b5 = a1;\
|
||||
b5 = _mm_xor_si128(b5, a4);\
|
||||
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
|
||||
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
|
||||
b6 = b5;\
|
||||
\
|
||||
/* 4 */\
|
||||
b7 = _mm_xor_si128(b7, a6);\
|
||||
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
|
||||
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
|
||||
b2 = _mm_xor_si128(b2, b7);\
|
||||
\
|
||||
/* 3 */\
|
||||
b0 = _mm_xor_si128(b0, a7);\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
|
||||
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
|
||||
b3 = b0;\
|
||||
b1 = _mm_xor_si128(b1, b0);\
|
||||
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
|
||||
b3 = _mm_xor_si128(b3, b4);\
|
||||
b6 = _mm_xor_si128(b6, b4);\
|
||||
\
|
||||
/* 6 */\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
|
||||
b4 = _mm_xor_si128(b4, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
/* 7 */\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
|
||||
b2 = _mm_xor_si128(b2, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
\
|
||||
/* 8 */\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* 9 */\
|
||||
a3 = TEMP_MUL1[2];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 10 */\
|
||||
a1 = TEMP_MUL1[6];\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
|
||||
b1 = _mm_xor_si128(b1, a1);\
|
||||
b4 = _mm_xor_si128(b4, a1);\
|
||||
\
|
||||
/* 11 */\
|
||||
a5 = TEMP_MUL1[3];\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
\
|
||||
/* 12 */\
|
||||
a3 = TEMP_MUL1[7];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
|
||||
b2 = _mm_xor_si128(b2, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 13 */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/**/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}/**/
|
||||
|
||||
/* vperm:
|
||||
* transformation before rounds with ipt
|
||||
* first round add transformed constant
|
||||
* middle rounds: add constant XOR 0x15...15
|
||||
* last round: additionally add 0x15...15 after MB
|
||||
* transformation after rounds with opt
|
||||
*/
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
xmm0 = ROUND_CONST_L0[i];\
|
||||
xmm1 = ROUND_CONST_L7[i];\
|
||||
xmm2 = ROUND_CONST_L0[j];\
|
||||
xmm3 = ROUND_CONST_L7[j];\
|
||||
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
|
||||
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
|
||||
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
|
||||
ROUND_CONST_L0[i] = xmm0;\
|
||||
ROUND_CONST_L7[i] = xmm1;\
|
||||
ROUND_CONST_L0[j] = xmm2;\
|
||||
ROUND_CONST_L7[j] = xmm3;\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
xmm0 = ROUND_CONST_Lx;\
|
||||
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
ROUND_CONST_Lx = xmm0;\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
|
||||
return;
|
||||
}//OF512()
|
||||
|
||||
|
||||
|
||||
@@ -14,50 +14,15 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#ifdef __AES__
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
#include "groestl-intr-aes.h"
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
@@ -67,8 +32,10 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT(ctx->chaining);
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -87,8 +54,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT(ctx->chaining);
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -104,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
// 5. Midstate will work at reduced impact than full hash, if total hash
|
||||
// (midstate + tail) is less than 1 block.
|
||||
// This, unfortunately, is the case with all current users.
|
||||
// 6. the morefull blocks the bigger the gain
|
||||
// 6. the more full blocks the bigger the gain
|
||||
|
||||
// use only for midstate precalc
|
||||
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
@@ -138,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
// deprecated do not use
|
||||
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
// first pad byte = 0x80, last pad byte = block count
|
||||
@@ -152,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
|
||||
0, 0 ,0,0, 0,0,0,0 );
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -180,6 +142,75 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
|
||||
int i;
|
||||
ctx->hashlen = 64;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024( ctx->chaining, ctx->buffer );
|
||||
OF1024( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
@@ -187,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
int i;
|
||||
|
||||
@@ -211,21 +242,18 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
|
||||
0, 0 ,0,0, 0,0,0,0 );
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
|
||||
@@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
@@ -11,43 +11,9 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#ifdef __AES__
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl256-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl256-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
#include "groestl256-intr-aes.h"
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
@@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
@@ -86,8 +51,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256(ctx->chaining);
|
||||
|
||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||
|
||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
// INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -246,6 +214,98 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
int i;
|
||||
ctx->hashlen = 32;
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// remainder is only 8 bytes, ie u64.
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
// must be cryptonight, copy 64 bits of data
|
||||
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
|
||||
i = -1; // signal for odd length
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy any remaining data to buffer for final transform
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
}
|
||||
|
||||
//--- final ---
|
||||
|
||||
// adjust for final block
|
||||
blocks++;
|
||||
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( i == -1 )
|
||||
{
|
||||
// cryptonight odd length
|
||||
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
|
||||
// finish the block with zero and length padding as normal
|
||||
i = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
|
||||
@@ -93,9 +93,6 @@ typedef enum
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
|
||||
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
|
||||
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
|
||||
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
|
||||
// u64 block_counter; /* message block counter */
|
||||
int hashlen; // bytes
|
||||
int blk_count;
|
||||
int buf_ptr; /* data buffer pointer */
|
||||
@@ -118,4 +115,7 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
64
algo/groestl/groestl-4way.c
Normal file
64
algo/groestl/groestl-4way.c
Normal file
@@ -0,0 +1,64 @@
|
||||
#include "groestl-gate.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(GROESTL_4WAY_VAES)
|
||||
|
||||
#include "groestl512-hash-4way.h"
|
||||
|
||||
void groestl_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (128)));
|
||||
groestl512_4way_context ctx;
|
||||
|
||||
groestl512_4way_init( &ctx, 64 );
|
||||
groestl512_4way_update_close( &ctx, hash, input, 640 );
|
||||
|
||||
groestl512_4way_init( &ctx, 64 );
|
||||
groestl512_4way_update_close( &ctx, hash, hash, 512 );
|
||||
|
||||
dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
mm512_bswap32_intrlv80_4x128( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+ 4, n+1 );
|
||||
be32enc( noncep+ 8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
|
||||
groestl_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash+(lane<<3) )[7] <= Htarg )
|
||||
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash+(lane<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user