Compare commits

..

19 Commits

Author SHA1 Message Date
Jay D Dee
520d4d5384 v3.21.1 2023-02-08 22:11:05 -05:00
Jay D Dee
da7030faa8 v3.21.0 2022-12-21 13:09:14 -05:00
Jay D Dee
bd84f199fe v3.20.3 2022-10-21 23:12:18 -04:00
Jay D Dee
58030e2788 v3.20.2 2022-08-01 20:21:05 -04:00
Jay D Dee
1321ac474c v3.20.1 2022-07-26 18:36:40 -04:00
Jay D Dee
40d07c0097 v3.20.0 2022-07-17 13:30:50 -04:00
Jay D Dee
f552f2b1e8 v3.19.9 2022-07-10 11:04:00 -04:00
Jay D Dee
26b8927632 v3.19.8 2022-05-27 18:12:30 -04:00
Jay D Dee
db76d3865f v3.19.7 2022-04-02 12:44:57 -04:00
Jay D Dee
5b678d2481 v3.19.6 2022-02-21 23:14:24 -05:00
Jay D Dee
90137b391e v3.19.5 2022-01-30 20:59:54 -05:00
Jay D Dee
8727d79182 v3.19.4 2022-01-12 21:08:25 -05:00
Jay D Dee
17ccbc328f v3.19.3 2022-01-07 12:07:38 -05:00
Jay D Dee
0e3945ddb5 v3.19.2 2021-12-30 16:28:24 -05:00
Jay D Dee
7d2ef7973d v3.19.1 2021-11-20 00:46:01 -05:00
Jay D Dee
e6fd9b1d69 v3.19.0 2021-11-10 21:33:44 -05:00
Jay D Dee
1a234cbe53 v3.18.2 2021-10-19 22:35:36 -04:00
Jay D Dee
47cc5dcff5 v3.18.1 2021-10-10 22:50:19 -04:00
Jay D Dee
2cd1507c2e v3.7.4 2021-09-29 17:31:16 -04:00
179 changed files with 15866 additions and 18442 deletions

View File

@@ -1,4 +1,6 @@
These instructions may be out of date, see the Wiki for the latest...
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
1. Requirements:
---------------
@@ -32,14 +34,26 @@ but different package names.
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
support depending on your CPU and compiler version:
openssl 1.1.0e or higher.
"-march=native" is always the best choice
znver1 and znver2 should be recognized on most recent version of GCC and
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions to compile with new CPUs:
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
"-march=native" is usually the best choice, used by build.sh.
"-msha" Add SHA to other tuning options
"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized.
"-mcascadelake -msha" or
"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
Features can also be added individually:
"-msha" adds support for HW accelerated sha256.
"-mavx512" adds support for 512 bit vectors
"-mvaes" add support for parallel AES
Additional instructions for static compilalation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer

View File

@@ -1,6 +1,6 @@
Instructions for compiling cpuminer-opt for Windows.
Thwaw intructions nay be out of date. Please consult the wiki for
These intructions are out of date. Please consult the wiki for
the latest:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
@@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib
version available in the repositories.
Download the following source code packages from their respective and
respected download locations, copy them to ~/usr/lib/ and uncompress them.
respected download locations, copy them to $HOME/usr/lib/ and uncompress them.
openssl: https://github.com/openssl/openssl/releases
@@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct
Run cpuminer
In a command windows change directories to the unzipped release folder. to get a list of all options:
In a command windows change directories to the unzipped release folder. To get a list of all options:
cpuminer.exe --help
Command options are specific to where you mine. Refer to the pool's instructions on how to set them.
Create a link to the locally compiled version of gmp.h
$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
Edit configure.ac to fix lipthread package name.
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
7. Compile
you can use the default compile if you intend to use cpuminer-opt on the
same CPU and the virtual machine supports that architecture.
./build.sh
Otherwise you can compile manually while setting options in CFLAGS.
Some common options:
To compile for a specific CPU architecture:
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
This will compile for AMD Ryzen.
You can compile more generically for a set of specific CPU features
if you know what features you want:
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
This will compile for an older CPU that does not have AVX.
You can find several examples in build-allarch.sh
If you have a CPU with more than 64 threads and Windows 7 or higher you
can enable the CPU Groups feature:
-D_WIN32_WINNT==0x0601
Once you have run configure successfully run make with n CPU threads:
make -j n
Copy cpuminer.exe to the release directory, compress and copy the release
directory to a Windows system and run cpuminer.exe from the command line.
Run cpuminer
In a command windows change directories to the unzipped release folder.
to get a list of all options:
cpuminer.exe --help
Command options are specific to where you mine. Refer to the pool's
instructions on how to set them.

View File

@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \
@@ -158,7 +159,9 @@ cpuminer_SOURCES = \
algo/ripemd/lbry.c \
algo/ripemd/lbry-4way.c \
algo/scrypt/scrypt.c \
algo/scrypt/scrypt-core-4way.c \
algo/scrypt/neoscrypt.c \
algo/sha/sha256-hash.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
@@ -167,7 +170,9 @@ cpuminer_SOURCES = \
algo/sha/sha256-hash-2way-ni.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
@@ -200,7 +205,6 @@ cpuminer_SOURCES = \
algo/verthash/tiny_sha3/sha3.c \
algo/verthash/tiny_sha3/sha3-4way.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
@@ -280,11 +284,9 @@ cpuminer_SOURCES = \
algo/x22/x22i-gate.c \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/crypto/hmac-blake2b.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c

View File

@@ -40,17 +40,25 @@ Requirements
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
ARM and Aarch64 CPUs are not supported.
Mobile CPUs like laptop computers are not recommended because they aren't
designed for extreme heat of operating at full load for extended periods of
time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
@@ -66,53 +74,50 @@ Supported Algorithms
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
c11
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2h
lyra2re lyra2
lyra2rev2 lyra2v2
lyra2rev3 lyrav2v3
lyra2z
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
minotaur Ringcoin (RNG)
lyra2z330
m7m
minotaur
minotaurx
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
phi2
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -128,17 +133,17 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x12
x13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x14
x15
x16r
x16rv2
x16rt Gincoin (GIN)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x16rt
x16rt-veil veil
x16s
x17
x21s
x22i

View File

@@ -1,12 +1,22 @@
This file is included in the Windows binary package. Compile instructions
for Linux and Windows can be found in RELEASE_NOTES.
This package is officially avalable only from:
cpuminer-opt is open source and free of any fees. Many forks exist that are
closed source and contain usage fees. support open source free software.
This package is officially avalaible only from:
https://github.com/JayDDee/cpuminer-opt
No other sources should be trusted.
cpuminer is a console program that is executed from a DOS or Powershell
prompt. There is no GUI and no mouse support.
command prompt. There is no GUI and no mouse support.
New users are encouraged to consult the cpuminer-opt Wiki for detailed
information on usage:
https://github.com/JayDDee/cpuminer-opt/wiki
Miner programs are often flagged as malware by antivirus programs. This is
a false positive, they are flagged simply because they are cryptocurrency
@@ -18,14 +28,14 @@ error to find the fastest one that works. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
optimum speed using the best available features.
Architecture names and compile options used are only provided for Intel
Core series. Budget CPUs like Pentium and Celeron are often missing some
features.
Architecture names and compile options used are only provided for
mainstream desktop CPUs. Budget CPUs like Pentium and Celeron are often
missing some features. Check your CPU.
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Support for AMD CPUs older than Ryzen is incomplete and without specific
recommendations. Find the best fit. CPUs older than Piledriver, including
Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an
incompatible implementation of SSE2 on these CPUs.
More information for Intel and AMD CPU architectures and their features
can be found on Wikipedia.
@@ -34,26 +44,20 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
File name Architecture name
Exe file name Compile flags Arch name
cpuminer-sse2.exe Core2, Nehalem, generic x86_64 with SSE2
cpuminer-aes-sse42.exe Westmere
cpuminer-avx.exe Sandybridge, Ivybridge
cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
cpuminer-avx2-sha.exe AMD Zen1, Zen2
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
cpuminer-avx512-sha-vaes.exe AMD Zen4, Intel Rocketlake, Icelake
cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake
cpuminer-avx512-sha.exe "-march=cascadelake -msha" Rocketlake(2)
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake, Tigerlake(3)
cpuminer-zen.exe "-march=znver1" AMD Zen1, Zen2
cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(4)
(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
(2) Rocketlake build uses cascadelake+sha as a workaround until Rocketlake
compiler support is avalable.
(3) Icelake & Tigerlake are only available on some laptops. Mining with a
laptop is not recommended.
(4) Zen3 build uses zen2+vaes as a workaround until Zen3 compiler support is
available. Zen2 CPUs should use Zen1 build.
* Alderlake is a hybrid architecture with a mix of E-cores & P-cores. Although
the P-cores can support AVX512 the E-cores can't so Intel decided to disable
AVX512 on the the P-cores.
Notes about included DLL files:
@@ -64,10 +68,10 @@ source code obtained from the author's official repository. The exact
procedure is documented in the build instructions for Windows:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
Some DLL filess may already be installed on the system by Windows or third
party packages. They often will work and may be used instead of the included
file. Without a compelling reason to do so it's recommended to use the included
files as they are packaged.
Some included DLL files may already be installed on the system by Windows or
third party packages. They often will work and may be used instead of the
included version of the files.
If you like this software feel free to donate:

View File

@@ -22,7 +22,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -65,6 +65,193 @@ If not what makes it happen or not happen?
Change Log
----------
v3.21.1
Fixed a segfault in some obsolete algos.
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
Faster AVX2+VAES for anime 14%, hmq1725 6%.
Small optimizations to Luffa AVX2 & AVX512.
v3.20.2
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
Removed old unused yescrypt library and other unused code.
v3.20.1
sph_blake2b optimized 1-way SSSE3 & AVX2.
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
Removed imprecise hash & target display from rejected share log.
Share and target difficulty is now displayed only for low difficulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
v3.20.0
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
v3.19.9
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
Relaxed some excessively strict data alignment that was negatively affecting performance.
v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7
#369 Fixed time limited mining, --time-limit.
Fixed a potential compile error when using optimization below -O3.
v3.19.6
#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing
Fixed handling of nonce exhaust when hashing a fast algo with extranonce disabled
Small optimization to Shavite.
v3.19.5
Enhanced stratum-keepalive preemptively resets the stratum connection
before the server to avoid lost shares.
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
X16RT: eliminate unnecessary recalculations of the hash order.
Fix a few compiler warnings.
Fixed log colour error when a block is solved.
v3.19.4
#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
New option stratum-keepalive prevents stratum timeouts when no shares are
submitted for several minutes due to high difficulty.
Fixed a bug displaying optimizations for some algos.
v3.19.3
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
v3.19.2
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
Reduce log noise when replies to submitted shares are lost due to stratum errors.
Fugue prehash optimization for X16r family AVX2 & AVX512.
Small speed improvement for Hamsi AVX2 & AVX512.
Win: With CPU groups enabled the number of CPUs displayed in the ASCII art
affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64.
v3.19.1
Changes to Windows binaries package:
- builds for CPUs with AVX or lower have CPU groups disabled,
- zen3 build renamed to avx2-sha-vaes to support Alderlake as well as Zen3,
- zen build renamed to avx2-sha, supports Zen1 & Zen2,
- avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
- see README.txt for compatibility details.
Fixed a few compiler warnings that are new in GCC 11.
Other minor fixes.
v3.19.0
Windows binaries now built with support for CPU groups, requires Windows 7.
Changes to cpu-affinity:
- PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups,
- added support for CPU affinity for up to 256 threads or CPUs,
- streamlined code for more efficient initialization of miner threads,
- precise affining of each miner thread to a specific CPU,
- added an option to disable CPU affinity with "--cpu-affinity 0"
Faster sha256t with AVX512 & AVX2.
Added stratum error count to stats log, reported only when non-zero.
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo, and is only implemented for scrypt
at this time as it is the only algo with significant performance differences
with those features.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1
More speed for scrypt:
- additional scryptn2 optimizations for all CPU architectures,
- AVX2 is now used by default on CPUS with SHA but not AVX512,
- scrypt:1024 performance lost in v3.18.0 is restored,
- AVX512 & AVX2 improvements to scrypt:1024.
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
Issue #337: fixed a problem that could display negative stats values in the
first summary report if the report was forced prematurely due to a stratum
diff change. The stats will still be invalid but should display zeros.
v3.18.0
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
- AVX512 & SHA support for sha256, AVX512 has priority,
- up to 50% increase in hashrate,
- memory requirements reduced 30-60% depending on CPU architecture,
- memory usage displayed at startup,
- scrypt, default N=1024 (LTC), will likely perform slower.
Improved stale share detection and handling for Scrypt with large N factor:
- abort and discard partially computed hash when new work is detected,
- quicker response to new job, less time wasted mining stale job.
Improved stale share handling for all algorithms:
- report possible stale share when new work received with a previously
submitted share still pending,
- when new work is detected report the submission of an already completed,
otherwise valid, but likely stale, share,
- fixed incorrect block height in stale share log.
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
When stratum disconnects miner threads go to idle until reconnected.
Colour changes to some logs.
Some low level function name changes for clarity and consistency.
The reference hashrate in the summary log and the benchmark total hashrate
are now the mean hashrate for the session.
v3.17.1
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.

View File

@@ -67,7 +67,6 @@ void do_nothing () {}
bool return_true () { return true; }
bool return_false () { return false; }
void *return_null () { return NULL; }
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
void algo_not_tested()
{
@@ -95,7 +94,8 @@ int null_scanhash()
return 0;
}
// Default generic scanhash can be used in many cases.
// Default generic scanhash can be used in many cases. Not to be used when
// prehashing can be done or when byte swapping the data can be avoided.
int scanhash_generic( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -152,6 +152,9 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
// overwrite byte swapped nonce with original byte order for proper
// incrementing. The nonce only needs to byte swapped if it is to be
// sumbitted.
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -324,6 +327,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
@@ -371,15 +375,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break;
// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break;
// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break;
// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break;
// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;

View File

@@ -1,3 +1,6 @@
#ifndef __ALGO_GATE_API_H__
#define __ALGO_GATE_API_H__ 1
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
@@ -94,7 +97,6 @@ typedef uint32_t set_t;
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
// return set containing all elements from sets a & b
@@ -319,3 +321,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
// algo name if valid alias, NULL if invalid alias or algo.
void get_algo_alias( char **algo_or_alias );
#endif

View File

@@ -344,7 +344,7 @@ static size_t
detect_cpu(void) {
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
//cpu_vendors_x86 vendor = cpu_nobody;
x86_regs regs;
x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
uint32_t max_level, max_ext_level;
size_t cpu_flags = 0;
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
@@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) {
#endif
#endif
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */

View File

@@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
#endif
/* romix pre/post nop function */
/*
static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
(void)blocks; (void)nblocks;
}
*/
/* romix pre/post endian conversion function */
static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {

View File

@@ -37,6 +37,13 @@
#if defined(__AVX512F__)
static inline __m512i blamka( __m512i x, __m512i y )
{
__m512i xy = _mm512_mul_epu32( x, y );
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
_mm512_add_epi64( xy, xy ) );
}
static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor )
{

View File

@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define ror64(x, n) _mm512_ror_epi64((x), (n))
static __m512i muladd(__m512i x, __m512i y)
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 32); \
D1 = ror64(D1, 32); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 24); \
B1 = ror64(B1, 24); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 16); \
D1 = ror64(D1, 16); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 63); \
B1 = ror64(B1, 63); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t0, t1; \
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t0; \
A1 = t1; \
__m512i t; \
t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \

View File

@@ -49,6 +49,20 @@ extern "C"{
#define SPH_SIZE_blake512 512
/////////////////////////
//
// Blake-256 1 way SSE2
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2

File diff suppressed because it is too large Load Diff

View File

@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \
@@ -214,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
#define B2B_G(a, b, c, d, x, y) \
{ \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}

View File

@@ -108,11 +108,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
@@ -320,11 +320,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)

View File

@@ -314,10 +314,11 @@ static const sph_u64 CB[16] = {
// Blake-512 8 way AVX512
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -325,9 +326,10 @@ static const sph_u64 CB[16] = {
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_8WAY(r) do { \
#define ROUND_B_8WAY( r ) \
{ \
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -336,13 +338,13 @@ static const sph_u64 CB[16] = {
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_8WAY( buf ) do \
#define COMPRESS64_8WAY( buf ) \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -361,14 +363,10 @@ static const sph_u64 CB[16] = {
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB5 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB6 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB7 ) ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -413,7 +411,7 @@ static const sph_u64 CB[16] = {
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_8way_compress( blake_8way_big_context *sc )
{
@@ -435,14 +433,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
@@ -493,6 +487,307 @@ void blake512_8way_compress( blake_8way_big_context *sc )
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = sc->buf[ 9];
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// with final_le forms a full hash in 2 parts from little endian data.
// all variables hard coded for 80 bytes/lane.
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data )
{
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// Do half of G4, skip the nonce
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
sc->buf[ 4] ) );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
V2 = _mm512_add_epi64( V2, V6 );
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
// pick up where we left off, need the nonce now.
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
// Round 1
// G0
GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
// G4, G5, G6, G7
GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
// remaining rounds
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
h[0] = mm512_xor3( V8, V0, sc->H[0] );
h[1] = mm512_xor3( V9, V1, sc->H[1] );
h[2] = mm512_xor3( VA, V2, sc->H[2] );
h[3] = mm512_xor3( VB, V3, sc->H[3] );
h[4] = mm512_xor3( VC, V4, sc->H[4] );
h[5] = mm512_xor3( VD, V5, sc->H[5] );
h[6] = mm512_xor3( VE, V6, sc->H[6] );
h[7] = mm512_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm512_block_bswap_64( (__m512i*)hash, h );
}
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -678,6 +973,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -694,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
{ \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_4WAY(r) do { \
#define ROUND_B_4WAY(r) \
{ \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -716,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst)
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_4WAY do \
#define COMPRESS64_4WAY \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -741,14 +1105,10 @@ blake512_8way_close(void *cc, void *dst)
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -791,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst)
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_4way_compress( blake_4way_big_context *sc )
@@ -869,6 +1229,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
sc->buf[ 4] ) );
// G2
V2 = _mm256_add_epi64( V2, V6 );
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
// Round 1
// G0
GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
// G4, G5, G6, G7
GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
h[0] = mm256_xor3( V8, V0, sc->H[0] );
h[1] = mm256_xor3( V9, V1, sc->H[1] );
h[2] = mm256_xor3( VA, V2, sc->H[2] );
h[3] = mm256_xor3( VB, V3, sc->H[3] );
h[4] = mm256_xor3( VC, V4, sc->H[4] );
h[5] = mm256_xor3( VD, V5, sc->H[5] );
h[6] = mm256_xor3( VE, V6, sc->H[6] );
h[7] = mm256_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm256_block_bswap_64( (__m256i*)hash, h );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );

View File

@@ -8,7 +8,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
long double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -16,7 +16,7 @@ double decred_calc_network_diff( struct work* work )
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
@@ -25,7 +25,7 @@ double decred_calc_network_diff( struct work* work )
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
shift, bits );
return net_diff;
}
@@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work,
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#if !defined(min)
#define min(a,b) (a>b ? (b) :(a))
#endif
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{

View File

@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#define COMPRESS32_LE do { \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = S0 ^ CS0; \
V9 = S1 ^ CS1; \
VA = S2 ^ CS2; \
VB = S3 ^ CS3; \
VC = T0 ^ CS4; \
VD = T0 ^ CS5; \
VE = T1 ^ CS6; \
VF = T1 ^ CS7; \
M0 = *((uint32_t*)(buf + 0)); \
M1 = *((uint32_t*)(buf + 4)); \
M2 = *((uint32_t*)(buf + 8)); \
M3 = *((uint32_t*)(buf + 12)); \
M4 = *((uint32_t*)(buf + 16)); \
M5 = *((uint32_t*)(buf + 20)); \
M6 = *((uint32_t*)(buf + 24)); \
M7 = *((uint32_t*)(buf + 28)); \
M8 = *((uint32_t*)(buf + 32)); \
M9 = *((uint32_t*)(buf + 36)); \
MA = *((uint32_t*)(buf + 40)); \
MB = *((uint32_t*)(buf + 44)); \
MC = *((uint32_t*)(buf + 48)); \
MD = *((uint32_t*)(buf + 52)); \
ME = *((uint32_t*)(buf + 56)); \
MF = *((uint32_t*)(buf + 60)); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
ROUND_S(4); \
ROUND_S(5); \
ROUND_S(6); \
ROUND_S(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S(8); \
ROUND_S(9); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
} \
H0 ^= S0 ^ V0 ^ V8; \
H1 ^= S1 ^ V1 ^ V9; \
H2 ^= S2 ^ V2 ^ VA; \
H3 ^= S3 ^ V3 ^ VB; \
H4 ^= S0 ^ V4 ^ VC; \
H5 ^= S1 ^ V5 ^ VD; \
H6 ^= S2 ^ V6 ^ VE; \
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#endif
#if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
sc->ptr = ptr;
}
static void
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
DECL_STATE32
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((T0 = SPH_T32(T0 + 512)) < 512)
T1 = SPH_T32(T1 + 1);
COMPRESS32_LE;
ptr = 0;
}
}
WRITE_STATE32(sc);
sc->ptr = ptr;
}
static void
blake32_close(sph_blake_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
blake32(cc, data, len);
}
void
sph_blake256_update_le(void *cc, const void *data, size_t len)
{
blake32_le(cc, data, len);
}
/* see sph_blake.h */
void
sph_blake256_close(void *cc, void *dst)

View File

@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
* @param len the input data length (in bytes)
*/
void sph_blake256(void *cc, const void *data, size_t len);
void sph_blake256_update_le(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-256 computation and output the result into

View File

@@ -30,18 +30,11 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
@@ -52,47 +45,143 @@
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#if defined(__AVX2__)
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
\
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
}
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
Va = Va + Vb + m[ sigma[R][Sa] ]; \
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
\
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}
#endif
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
uint64_t v[16] __attribute__ ((aligned (32)));
uint64_t m[16] __attribute__ ((aligned (32)));
int i;
for (i = 0; i < 8; i++) { // init work variables
v[i] = ctx->h[i];
@@ -106,16 +195,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
for (i = 0; i < 12; i++) { // twelve rounds
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for (i = 0; i < 12; i++)
BLAKE2B_ROUND( i );
for( i = 0; i < 8; ++i )
ctx->h[i] ^= v[i] ^ v[i + 8];

View File

@@ -594,22 +594,12 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
#define rb6(x) mm256_rol_64( x, 43 )
#define rb7(x) mm256_rol_64( x, 53 )
#define rol_off_64( M, j, off ) \
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b( mj0, mj3, mj10, h, K ) \
_mm256_xor_si256( h, _mm256_add_epi64( K, \
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
rol_off_64( M, j, 3 ) ), \
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( mm256_add4_64( \
#define expand1_b( qt, i ) \
mm256_add4_64( \
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
@@ -617,11 +607,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
#define expand2b( qt, M, H, i) \
_mm256_add_epi64( mm256_add4_64( \
#define expand2_b( qt, i) \
mm256_add4_64( \
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
@@ -629,159 +618,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
_mm256_add_epi64( mh[13], mh[14] ) )
#define Wb1 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
_mm256_sub_epi64( mh[14], mh[15] ) )
#define Wb2 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
_mm256_sub_epi64( mh[12], mh[15] ) )
#define Wb3 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
_mm256_sub_epi64( mh[10], \
mh[13] ) )
#define Wb4 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
_mm256_add_epi64( mh[11], mh[14] ) )
#define Wb5 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
_mm256_sub_epi64( mh[12], mh[15] ) )
#define Wb6 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
_mm256_sub_epi64( mh[11], mh[13] ) )
#define Wb7 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
_mm256_add_epi64( mh[12], mh[14] ) )
#define Wb8 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
_mm256_sub_epi64( mh[13], mh[15] ) )
#define Wb9 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[14], H[14] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 7], mh[14] ) )
#define Wb10 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[15], H[15] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
_mm256_sub_epi64( mh[ 7], mh[15] ) )
#define Wb11 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
#define Wb12 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[10], H[10] ) ) )
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 9], mh[10] ) )
#define Wb13 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[11], H[11] ) ) )
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
_mm256_add_epi64( mh[10], mh[11] ) )
#define Wb14 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[12], H[12] ) ) )
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
_mm256_add_epi64( mh[11], mh[12] ) )
#define Wb15 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[13], H[13] ) ) )
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
_mm256_sub_epi64( mh[ 9], mh[13] ) )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh;
__m256i mh[16];
int i;
for ( i = 0; i < 16; i++ )
mh[i] = _mm256_xor_si256( M[i], H[i] );
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
@@ -799,22 +727,77 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
qt[19] = expand2b( qt, M, H, 19 );
qt[20] = expand2b( qt, M, H, 20 );
qt[21] = expand2b( qt, M, H, 21 );
qt[22] = expand2b( qt, M, H, 22 );
qt[23] = expand2b( qt, M, H, 23 );
qt[24] = expand2b( qt, M, H, 24 );
qt[25] = expand2b( qt, M, H, 25 );
qt[26] = expand2b( qt, M, H, 26 );
qt[27] = expand2b( qt, M, H, 27 );
qt[28] = expand2b( qt, M, H, 28 );
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
__m256i mj[16];
mj[ 0] = mm256_rol_64( M[ 0], 1 );
mj[ 1] = mm256_rol_64( M[ 1], 2 );
mj[ 2] = mm256_rol_64( M[ 2], 3 );
mj[ 3] = mm256_rol_64( M[ 3], 4 );
mj[ 4] = mm256_rol_64( M[ 4], 5 );
mj[ 5] = mm256_rol_64( M[ 5], 6 );
mj[ 6] = mm256_rol_64( M[ 6], 7 );
mj[ 7] = mm256_rol_64( M[ 7], 8 );
mj[ 8] = mm256_rol_64( M[ 8], 9 );
mj[ 9] = mm256_rol_64( M[ 9], 10 );
mj[10] = mm256_rol_64( M[10], 11 );
mj[11] = mm256_rol_64( M[11], 12 );
mj[12] = mm256_rol_64( M[12], 13 );
mj[13] = mm256_rol_64( M[13], 14 );
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
K = _mm256_add_epi64( K, Kincr );
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
K = _mm256_add_epi64( K, Kincr );
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
K = _mm256_add_epi64( K, Kincr );
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
K = _mm256_add_epi64( K, Kincr );
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
K = _mm256_add_epi64( K, Kincr );
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
K = _mm256_add_epi64( K, Kincr );
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
K = _mm256_add_epi64( K, Kincr );
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
K = _mm256_add_epi64( K, Kincr );
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
K = _mm256_add_epi64( K, Kincr );
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
K = _mm256_add_epi64( K, Kincr );
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
K = _mm256_add_epi64( K, Kincr );
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
K = _mm256_add_epi64( K, Kincr );
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
K = _mm256_add_epi64( K, Kincr );
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
K = _mm256_add_epi64( K, Kincr );
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
xl = _mm256_xor_si256(
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
@@ -823,7 +806,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi64( \
_mm256_xor_si256( M[m], \
@@ -1066,21 +1048,12 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define r8b6(x) mm512_rol_64( x, 43 )
#define r8b7(x) mm512_rol_64( x, 53 )
#define rol8w_off_64( M, j, off ) \
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
_mm512_xor_si512( h, _mm512_add_epi64( K, \
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
#define add_elt_b8( M, H, j ) \
_mm512_xor_si512( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
rol8w_off_64( M, j, 3 ) ), \
rol8w_off_64( M, j, 10 ) ), \
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1b8( qt, M, H, i ) \
_mm512_add_epi64( mm512_add4_64( \
#define expand1_b8( qt, i ) \
mm512_add4_64( \
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
@@ -1088,11 +1061,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
#define expand2b8( qt, M, H, i) \
_mm512_add_epi64( mm512_add4_64( \
#define expand2_b8( qt, i) \
mm512_add4_64( \
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
@@ -1100,157 +1072,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
#define W8b0 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
_mm512_add_epi64( mh[13], mh[14] ) )
#define W8b1 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_xor_si512( M[11], H[11] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
_mm512_sub_epi64( mh[14], mh[15] ) )
#define W8b2 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
_mm512_sub_epi64( mh[12], mh[15] ) )
#define W8b3 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
_mm512_sub_epi64( mh[10], mh[13] ) )
#define W8b4 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
_mm512_add_epi64( mh[11], mh[14] ) )
#define W8b5 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
_mm512_sub_epi64( mh[12], mh[15] ) )
#define W8b6 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
_mm512_sub_epi64( mh[11], mh[13] ) )
#define W8b7 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
_mm512_add_epi64( mh[12], mh[14] ) )
#define W8b8 \
_mm512_add_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
_mm512_sub_epi64( mh[13], mh[15] ) )
#define W8b9 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 7], mh[14] ) )
#define W8b10 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
_mm512_sub_epi64( mh[ 7], mh[15] ) )
#define W8b11 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
#define W8b12 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[10], H[10] ) ) )
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 9], mh[10] ) )
#define W8b13 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[11], H[11] ) ) )
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
_mm512_add_epi64( mh[10], mh[11] ) )
#define W8b14 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[12], H[12] ) ) )
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
_mm512_add_epi64( mh[11], mh[12] ) )
#define W8b15 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[ 4], H[4] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
_mm512_sub_epi64( mh[ 9], mh[13] ) )
void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
int i;
for ( i = 0; i < 16; i++ )
mh[i] = _mm512_xor_si512( M[i], H[i] );
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
@@ -1268,22 +1180,77 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
qt[16] = expand1b8( qt, M, H, 16 );
qt[17] = expand1b8( qt, M, H, 17 );
qt[18] = expand2b8( qt, M, H, 18 );
qt[19] = expand2b8( qt, M, H, 19 );
qt[20] = expand2b8( qt, M, H, 20 );
qt[21] = expand2b8( qt, M, H, 21 );
qt[22] = expand2b8( qt, M, H, 22 );
qt[23] = expand2b8( qt, M, H, 23 );
qt[24] = expand2b8( qt, M, H, 24 );
qt[25] = expand2b8( qt, M, H, 25 );
qt[26] = expand2b8( qt, M, H, 26 );
qt[27] = expand2b8( qt, M, H, 27 );
qt[28] = expand2b8( qt, M, H, 28 );
qt[29] = expand2b8( qt, M, H, 29 );
qt[30] = expand2b8( qt, M, H, 30 );
qt[31] = expand2b8( qt, M, H, 31 );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
mj[ 3] = mm512_rol_64( M[ 3], 4 );
mj[ 4] = mm512_rol_64( M[ 4], 5 );
mj[ 5] = mm512_rol_64( M[ 5], 6 );
mj[ 6] = mm512_rol_64( M[ 6], 7 );
mj[ 7] = mm512_rol_64( M[ 7], 8 );
mj[ 8] = mm512_rol_64( M[ 8], 9 );
mj[ 9] = mm512_rol_64( M[ 9], 10 );
mj[10] = mm512_rol_64( M[10], 11 );
mj[11] = mm512_rol_64( M[11], 12 );
mj[12] = mm512_rol_64( M[12], 13 );
mj[13] = mm512_rol_64( M[13], 14 );
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
K = _mm512_add_epi64( K, Kincr );
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
K = _mm512_add_epi64( K, Kincr );
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
K = _mm512_add_epi64( K, Kincr );
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
K = _mm512_add_epi64( K, Kincr );
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
K = _mm512_add_epi64( K, Kincr );
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
K = _mm512_add_epi64( K, Kincr );
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
K = _mm512_add_epi64( K, Kincr );
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
K = _mm512_add_epi64( K, Kincr );
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
K = _mm512_add_epi64( K, Kincr );
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
K = _mm512_add_epi64( K, Kincr );
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
K = _mm512_add_epi64( K, Kincr );
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
K = _mm512_add_epi64( K, Kincr );
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
K = _mm512_add_epi64( K, Kincr );
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
K = _mm512_add_epi64( K, Kincr );
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
mm512_xor3( qt[19], qt[20], qt[21] ),

View File

@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm512_rol_32( x2, 7 );
x1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
y0 = mm512_rol_32( x2, 7 );
y1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( x0, 7 );
x3 = mm512_rol_32( x1, 7 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( y1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap128_64( x4 );
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( y0, 11 );
x2 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( y1, 11 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( x2, 11 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x2 = _mm512_xor_si512( y1, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
@@ -98,6 +94,122 @@ static void transform_4way( cube_4way_context *sp )
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
}
// 8 ways, 4 way parallel double buffered
static void transform_4way_2buf( cube_4way_2buf_context *sp )
{
int r;
const int rounds = sp->rounds;
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
__m512i tx0, tx1, ty0, ty1;
x0 = _mm512_load_si512( (__m512i*)sp->h0 );
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
y0 = _mm512_load_si512( (__m512i*)sp->h1 );
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx0 = mm512_rol_32( x2, 7 );
ty0 = mm512_rol_32( y2, 7 );
tx1 = mm512_rol_32( x3, 7 );
ty1 = mm512_rol_32( y3, 7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x2 = mm512_rol_32( x0, 7 );
y2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( x1, 7 );
y3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( tx1, x5 );
y1 = _mm512_xor_si512( ty1, y5 );
x4 = mm512_swap128_64( x4 );
y4 = mm512_swap128_64( y4 );
x5 = mm512_swap128_64( x5 );
y5 = mm512_swap128_64( y5 );
x2 = _mm512_xor_si512( x2, x6 );
y2 = _mm512_xor_si512( y2, y6 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap128_64( x6 );
y6 = mm512_swap128_64( y6 );
x7 = mm512_swap128_64( x7 );
y7 = mm512_swap128_64( y7 );
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx0 = mm512_rol_32( x1, 11 );
ty0 = mm512_rol_32( y1, 11 );
tx1 = mm512_rol_32( x3, 11 );
ty1 = mm512_rol_32( y3, 11 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( y0, 11 );
x3 = mm512_rol_32( x2, 11 );
y3 = mm512_rol_32( y2, 11 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( x1, x5 );
y1 = _mm512_xor_si512( y1, y5 );
x4 = mm512_swap64_32( x4 );
y4 = mm512_swap64_32( y4 );
x5 = mm512_swap64_32( x5 );
y5 = mm512_swap64_32( y5 );
x2 = _mm512_xor_si512( tx1, x6 );
y2 = _mm512_xor_si512( ty1, y6 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap64_32( x6 );
y6 = mm512_swap64_32( y6 );
x7 = mm512_swap64_32( x7 );
y7 = mm512_swap64_32( y7 );
}
_mm512_store_si512( (__m512i*)sp->h0, x0 );
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
_mm512_store_si512( (__m512i*)sp->h1, y0 );
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
}
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
@@ -109,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
@@ -219,6 +323,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
return 0;
}
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
void *output0, void *output1, int hashbitlen,
const void *data0, const void *data1, size_t size )
{
__m512i *h0 = (__m512i*)sp->h0;
__m512i *h1 = (__m512i*)sp->h1;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h1[0] = h0[0] = m512_const1_128( iv[0] );
h1[1] = h0[1] = m512_const1_128( iv[1] );
h1[2] = h0[2] = m512_const1_128( iv[2] );
h1[3] = h0[3] = m512_const1_128( iv[3] );
h1[4] = h0[4] = m512_const1_128( iv[4] );
h1[5] = h0[5] = m512_const1_128( iv[5] );
h1[6] = h0[6] = m512_const1_128( iv[6] );
h1[7] = h0[7] = m512_const1_128( iv[7] );
const int len = size >> 4;
const __m512i *in0 = (__m512i*)data0;
const __m512i *in1 = (__m512i*)data1;
__m512i *hash0 = (__m512i*)output0;
__m512i *hash1 = (__m512i*)output1;
int i;
for ( i = 0; i < len; i++ )
{
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way_2buf( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
__m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
transform_4way_2buf( sp );
tmp = m512_const2_64( 0x0000000100000000, 0 );
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
for ( i = 0; i < 10; ++i )
transform_4way_2buf( sp );
memcpy( hash0, sp->h0, sp->hashlen<<6);
memcpy( hash1, sp->h1, sp->hashlen<<6);
return 0;
}
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
@@ -259,6 +424,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
// 2 way 128
// This isn't expected to be used with AVX512 so HW rotate intruction
// is assumed not avaiable.
// Use double buffering to optimize serial bit rotations. Full double
// buffering isn't practical because it needs twice as many registers
// with AVX2 having only half as many as AVX512.
#define ROL2( out0, out1, in0, in1, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( in0, c ); \
__m256i t1 = _mm256_slli_epi32( in1, c ); \
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
out0 = _mm256_or_si256( out0, t0 ); \
out1 = _mm256_or_si256( out1, t1 ); \
}
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -281,14 +461,10 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm256_rol_32( x2, 7 );
x1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( y0, 7 );
x3 = mm256_rol_32( y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
@@ -299,15 +475,11 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( y0, 11 );
x2 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x2 = _mm256_xor_si256( y1, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
@@ -336,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
@@ -356,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;

View File

@@ -17,41 +17,41 @@ struct _cube_4way_context
int pos;
} __attribute__ ((aligned (128)));
struct _cube_4way_2buf_context
{
__m512i h0[8];
__m512i h1[8];
int hashlen;
int rounds;
int blocksize;
int pos;
} __attribute__ ((aligned (128)));
typedef struct _cube_4way_context cube_4way_context;
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
int blockbytes );
int blockbytes );
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
int cube_4way_close( cube_4way_context *sp, void *output );
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size );
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
#define cube512_4way_update cube_4way_update
#define cube512_4way_update_close cube_4way_update
#define cube512_4way_close cube_4way_update
#define cube512_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 512, data, size )
#define cube512_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 512, data, size )
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
#define cube256_4way_update cube_4way_update
#define cube256_4way_update_close cube_4way_update
#define cube256_4way_close cube_4way_update
#define cube256_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 256, data, size )
#define cube256_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 256, data, size )
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
void *output0, void *output1, int hashbitlen,
const void *data0, const void *data1, size_t size );
#endif
// 2x128, 2 way parallel SSE2
// 2x128, 2 way parallel AVX2
struct _cube_2way_context
{

View File

@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
for ( r = 0; r < rounds; ++r )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
x0 = _mm512_xor_si512( mm512_rol_32(
mm512_swap256_128( x0 ), 11 ), x1 );
x0 = mm512_swap_256( x0 );
x0 = mm512_rol_32( x0, 7 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap128_64( x1 );
x1 = _mm512_add_epi32( x0, x1 );
x0 = mm512_swap256_128( x0 );
x0 = mm512_rol_32( x0, 11 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap64_32( x1 );
}

View File

@@ -15,11 +15,11 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;

View File

@@ -37,12 +37,23 @@ typedef struct
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
#define fugue512_update( state, data, len ) \
fugue512_Update( state, data, (len)<<3 )
#define fugue512_final \
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
#endif // AES

View File

@@ -156,14 +156,12 @@ int groestl512_full( hashState_groestl* ctx, void* output,
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
@@ -175,8 +173,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -43,7 +43,8 @@
#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#define ROTL64(a,n) rol64( a, n )
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))

View File

@@ -227,12 +227,10 @@ int groestl256_full( hashState_groestl256* ctx,
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
@@ -245,7 +243,7 @@ int groestl256_full( hashState_groestl256* ctx,
// cryptonight has 200 byte input, an odd number of __m128i
// remainder is only 8 bytes, ie u64.
if ( databitlen % 128 !=0 )
if ( databitlen % 128 != 0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
@@ -255,8 +253,8 @@ int groestl256_full( hashState_groestl256* ctx,
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
}
//--- final ---

View File

@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
//#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#define ROTL64(a,n) rol64( a, n )
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))

View File

@@ -50,7 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
@@ -67,7 +66,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -76,11 +74,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---
@@ -206,7 +203,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
@@ -223,7 +219,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -232,11 +227,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -99,7 +99,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -108,8 +107,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---
@@ -222,7 +220,6 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
memset_zero_256( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -231,8 +228,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---

View File

@@ -11,7 +11,7 @@
#else
#include "sph_groestl.h"
#endif
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
typedef struct {
#ifdef __AES__
@@ -19,7 +19,6 @@ typedef struct {
#else
sph_groestl512_context groestl;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
#else
sph_groestl512_init( &myrgr_ctx.groestl );
#endif
sph_sha256_init( &myrgr_ctx.sha );
}
void myriad_hash(void *output, const void *input)
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_sha256( &ctx.sha, hash, 64 );
sph_sha256_close( &ctx.sha, hash );
sha256_full( hash, hash, 64 );
memcpy(output, hash, 32);
}

View File

@@ -545,31 +545,33 @@ static const sph_u32 T512[64][16] = {
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way AVX512
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
// on i9-9940x cmplt with zero was 3% faster than movepi.
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
const __m512i zero = m512_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
@@ -583,9 +585,8 @@ do { \
t = _mm512_xor_si512( t, c ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
b = mm512_xor3( b, d, t ); \
a = c; \
c = b; \
c = mm512_xor3( b, d, t ); \
b = d; \
d = mm512_not( t ); \
} while (0)
@@ -609,162 +610,184 @@ do { \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
#define ROUND_BIG8(rc, alpha) \
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, m512_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
__m512i t0, t1, t2, t3, t4, t5; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
} while (0)
#define P_BIG8 \
do { \
ROUND_BIG8(0, alpha_n); \
ROUND_BIG8(1, alpha_n); \
ROUND_BIG8(2, alpha_n); \
ROUND_BIG8(3, alpha_n); \
ROUND_BIG8(4, alpha_n); \
ROUND_BIG8(5, alpha_n); \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define PF_BIG8 \
do { \
ROUND_BIG8( 0, alpha_f); \
ROUND_BIG8( 1, alpha_f); \
ROUND_BIG8( 2, alpha_f); \
ROUND_BIG8( 3, alpha_f); \
ROUND_BIG8( 4, alpha_f); \
ROUND_BIG8( 5, alpha_f); \
ROUND_BIG8( 6, alpha_f); \
ROUND_BIG8( 7, alpha_f); \
ROUND_BIG8( 8, alpha_f); \
ROUND_BIG8( 9, alpha_f); \
ROUND_BIG8(10, alpha_f); \
ROUND_BIG8(11, alpha_f); \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
@@ -801,7 +824,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
@@ -851,13 +873,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
#define INPUT_BIG \
do { \
__m256i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 63; u >= 0; u-- ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
@@ -875,7 +896,6 @@ do { \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm256_srli_epi64( db, 1 ); \
} \
} while (0)
@@ -895,10 +915,9 @@ do { \
d = _mm256_xor_si256( d, a ); \
a = _mm256_and_si256( a, b ); \
t = _mm256_xor_si256( t, a ); \
b = _mm256_xor_si256( b, d ); \
b = _mm256_xor_si256( b, t ); \
a = c; \
c = b; \
c = _mm256_xor_si256( b, d ); \
c = _mm256_xor_si256( c, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
@@ -924,180 +943,184 @@ do { \
#define READ_STATE_BIG(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG(rc, alpha) \
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
__m256i t0, t1, t2, t3, t4, t5; \
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
sA = _mm256_xor_si256( sA, alpha[10] ); \
sB = _mm256_xor_si256( sB, alpha[11] ); \
sC = _mm256_xor_si256( sC, alpha[12] ); \
sD = _mm256_xor_si256( sD, alpha[13] ); \
sE = _mm256_xor_si256( sE, alpha[14] ); \
sF = _mm256_xor_si256( sF, alpha[15] ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \
SBOX( s2, s6, sA, sE ); \
SBOX( s3, s7, sB, sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t0, s9, t1 ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t2, sA, t3 ); \
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t4, sB, t5 ); \
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t2, s8, t3 ); \
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
t3 = mm256_swap64_32( t3 ); \
L( t0, t1, t2, t3 ); \
t3 = mm256_swap64_32( t3 ); \
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
\
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
L( t0, t1, t2, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
} while (0)
#define P_BIG \
do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define PF_BIG \
do { \
ROUND_BIG( 0, alpha_f); \
ROUND_BIG( 1, alpha_f); \
ROUND_BIG( 2, alpha_f); \
ROUND_BIG( 3, alpha_f); \
ROUND_BIG( 4, alpha_f); \
ROUND_BIG( 5, alpha_f); \
ROUND_BIG( 6, alpha_f); \
ROUND_BIG( 7, alpha_f); \
ROUND_BIG( 8, alpha_f); \
ROUND_BIG( 9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define T_BIG \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
} while (0)
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )

View File

@@ -141,6 +141,13 @@ do { \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
} while (0)
/*
* PASSy(n, in) computes pass number "y", for a total of "n", using the
* one-argument macro "in" to access input words. Current state is assumed
@@ -152,22 +159,22 @@ do { \
#define PASS1(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
@@ -605,25 +612,32 @@ do { \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), w ); \
} while (0)
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)

View File

@@ -1,382 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#include <assert.h>
#include <string.h>
#ifdef _MSC_VER
#define inline __inline
#endif
#include "sph_hefty1.h"
#define Min(A, B) (A <= B ? A : B)
#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
{ \
/* To thwart parallelism, Br modifies itself each time it's \
* called. This also means that calling it in different \
* orders yeilds different results. In C the order of \
* evaluation of function arguments and + operands are \
* unspecified (and depends on the compiler), so we must make \
* the order of Br calls explicit. \
*/ \
uint32_t brG = Br(ctx, G); \
uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \
uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \
uint32_t brC = Br(ctx, C); \
uint32_t brB = Br(ctx, B); \
uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \
uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \
H = G; \
G = F; \
F = E; \
E = D + Br(ctx, tmp2); \
D = C; \
C = B; \
B = A; \
A = tmp2 + tmp4; \
} \
/* Nothing up my sleeve constants */
const static uint32_t K[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
/* Initial hash values */
const static uint32_t H[HEFTY1_STATE_WORDS] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
0x510e527fUL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL
};
static inline uint32_t Rr(uint32_t X, uint8_t n)
{
return (X >> n) | (X << (32 - n));
}
static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
{
return (E & F) ^ (~E & G);
}
static inline uint32_t Sigma1(uint32_t E)
{
return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
}
static inline uint32_t sigma1(uint32_t X)
{
return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
}
static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
{
return (A & B) ^ (A & C) ^ (B & C);
}
static inline uint32_t Sigma0(uint32_t A)
{
return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
}
static inline uint32_t sigma0(uint32_t X)
{
return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
}
static inline uint32_t Reverse32(uint32_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
#else
return n;
#endif
}
static inline uint64_t Reverse64(uint64_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
uint32_t a = n >> 32;
uint32_t b = (n << 32) >> 32;
return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
#else
return n;
#endif
}
/* Smoosh byte into nibble */
static inline uint8_t Smoosh4(uint8_t X)
{
return (X >> 4) ^ (X & 0xf);
}
/* Smoosh 32-bit word into 2-bits */
static inline uint8_t Smoosh2(uint32_t X)
{
uint16_t w = (X >> 16) ^ (X & 0xffff);
uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
return (n >> 2) ^ (n & 0x3);
}
static void Mangle(uint32_t *S)
{
uint32_t *R = S;
uint32_t *C = &S[1];
uint8_t r0 = Smoosh4(R[0] >> 24);
uint8_t r1 = Smoosh4(R[0] >> 16);
uint8_t r2 = Smoosh4(R[0] >> 8);
uint8_t r3 = Smoosh4(R[0] & 0xff);
int i;
/* Diffuse */
uint32_t tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
uint8_t r = Smoosh2(tmp);
switch (r) {
case 0:
C[i] ^= Rr(R[0], i + r0);
break;
case 1:
C[i] += Rr(~R[0], i + r1);
break;
case 2:
C[i] &= Rr(~R[0], i + r2);
break;
case 3:
C[i] ^= Rr(R[0], i + r3);
break;
}
tmp ^= C[i];
}
/* Compress */
tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
if (i % 2)
tmp ^= C[i];
else
tmp += C[i];
R[0] ^= tmp;
}
static void Absorb(uint32_t *S, uint32_t X)
{
uint32_t *R = S;
R[0] ^= X;
Mangle(S);
}
static uint32_t Squeeze(uint32_t *S)
{
uint32_t Y = S[0];
Mangle(S);
return Y;
}
/* Branch, compress and serialize function */
static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
{
uint32_t R = Squeeze(ctx->sponge);
uint8_t r0 = R >> 8;
uint8_t r1 = R & 0xff;
uint32_t Y = 1 << (r0 % 32);
switch (r1 % 4)
{
case 0:
/* Do nothing */
break;
case 1:
return X & ~Y;
case 2:
return X | Y;
case 3:
return X ^ Y;
}
return X;
}
static void HashBlock(HEFTY1_CTX *ctx)
{
uint32_t A, B, C, D, E, F, G, H;
uint32_t W[HEFTY1_BLOCK_BYTES];
assert(ctx);
A = ctx->h[0];
B = ctx->h[1];
C = ctx->h[2];
D = ctx->h[3];
E = ctx->h[4];
F = ctx->h[5];
G = ctx->h[6];
H = ctx->h[7];
int t = 0;
for (; t < 16; t++) {
W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
Absorb(ctx->sponge, W[t] ^ K[t]);
}
for (t = 0; t < 16; t++) {
Absorb(ctx->sponge, D ^ H);
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
for (t = 16; t < 64; t++) {
Absorb(ctx->sponge, H + D);
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
ctx->h[0] += A;
ctx->h[1] += B;
ctx->h[2] += C;
ctx->h[3] += D;
ctx->h[4] += E;
ctx->h[5] += F;
ctx->h[6] += G;
ctx->h[7] += H;
A = 0;
B = 0;
C = 0;
D = 0;
E = 0;
F = 0;
G = 0;
H = 0;
memset(W, 0, sizeof(W));
}
/* Public interface */
void HEFTY1_Init(HEFTY1_CTX *ctx)
{
assert(ctx);
memcpy(ctx->h, H, sizeof(ctx->h));
memset(ctx->block, 0, sizeof(ctx->block));
ctx->written = 0;
memset(ctx->sponge, 0, sizeof(ctx->sponge));
}
void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
{
assert(ctx);
uint64_t read = 0;
while (len) {
size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
len -= count;
read += count;
ctx->written += count;
if (!(ctx->written % HEFTY1_BLOCK_BYTES))
HashBlock(ctx);
}
}
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
{
assert(digest);
assert(ctx);
/* Pad message (FIPS 180 Section 5.1.1) */
size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
ctx->block[used++] = 0x80; /* Append 1 to end of message */
if (used > HEFTY1_BLOCK_BYTES - 8) {
/* We have already written into the last 64bits, so
* we must continue into the next block. */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
HashBlock(ctx);
used = 0; /* Create a new block (below) */
}
/* All remaining bits to zero */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
/* The last 64bits encode the length (in network byte order) */
uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
*len = Reverse64(ctx->written*8);
HashBlock(ctx);
/* Convert back to network byte order */
int i = 0;
for (; i < HEFTY1_STATE_WORDS; i++)
ctx->h[i] = Reverse32(ctx->h[i]);
memcpy(digest, ctx->h, sizeof(ctx->h));
memset(ctx, 0, sizeof(HEFTY1_CTX));
}
unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
{
HEFTY1_CTX ctx;
static unsigned char m[HEFTY1_DIGEST_BYTES];
if (!digest)
digest = m;
HEFTY1_Init(&ctx);
HEFTY1_Update(&ctx, buf, len);
HEFTY1_Final(digest, &ctx);
return digest;
}

View File

@@ -1,66 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#ifndef __HEFTY1_H__
#define __HEFTY1_H__
#ifdef __cplusplus
extern "C" {
#endif
#ifndef WIN32
#include <sys/types.h>
#endif
#include <inttypes.h>
#define HEFTY1_DIGEST_BYTES 32
#define HEFTY1_BLOCK_BYTES 64
#define HEFTY1_STATE_WORDS 8
#define HEFTY1_SPONGE_WORDS 4
typedef struct HEFTY1_CTX {
uint32_t h[HEFTY1_STATE_WORDS];
uint8_t block[HEFTY1_BLOCK_BYTES];
uint64_t written;
uint32_t sponge[HEFTY1_SPONGE_WORDS];
} HEFTY1_CTX;
void HEFTY1_Init(HEFTY1_CTX *cxt);
void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
#ifdef __cplusplus
}
#endif
#endif /* __HEFTY1_H__ */

View File

@@ -7,6 +7,7 @@
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"
#if defined(__AES__)

View File

@@ -45,6 +45,6 @@ void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);
void sha512ProcessBlock(Sha512Context *context);
void sha512ProcessBlock(Sha512Context contexti[2] );
#endif

View File

@@ -49,12 +49,11 @@ extern "C"{
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
const __m512i cc = _mm512_set1_epi64( c ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x0 = mm512_xorandnot( x0, x3, x2 ); \
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
@@ -79,7 +78,7 @@ do { \
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \

View File

@@ -1,5 +1,6 @@
#include "keccak-gate.h"
#include "sph_keccak.h"
#include "algo/sha/sha256d.h"
int hard_coded_eb = 1;

View File

@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
#define WRITE_STATE(sc)
#define MOV64(d, s) (d = s)
#define XOR64_IOTA XOR64
#define XOR64_IOTA XOR
#define LPAR (
#define RPAR )
@@ -70,15 +71,16 @@ static const uint64_t RC[] = {
// Targetted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define DECL64(x) __m512i x
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = mm512_not( s ) )
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX512
// AVX2
@@ -254,14 +257,16 @@ keccak512_8way_close(void *cc, void *dst)
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define DECL64(x) __m256i x
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX2

View File

@@ -1,6 +1,19 @@
#ifdef TH_ELT
#undef TH_ELT
#endif
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
XOR3( tt0, d0, d1, d4 ); \
XOR( tt1, d2, d3 ); \
XOR( tt0, tt0, tt1 ); \
ROL64( tt0, tt0, 1 ); \
XOR3( tt1, c0, c1, c4 ); \
XOR3( tt0, tt0, c2, c3 ); \
XOR( t, tt0, tt1 ); \
} while (0)
/*
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
@@ -17,7 +30,7 @@
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
*/
#ifdef THETA
#undef THETA
#endif

View File

@@ -62,186 +62,66 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT4W(a,b,c0,c1)\
a = _mm512_xor_si512(a,c0);\
b = _mm512_xor_si512(b,c1);
#define ADD_CONSTANT4W( a, b, c0, c1 ) \
a = _mm512_xor_si512( a, c0 ); \
b = _mm512_xor_si512( b, c1 );
#define MULT24W( a0, a1 ) \
do { \
{ \
__m512i b = _mm512_xor_si512( a0, \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
_mm512_bslli_epi128( a1,12 ) ); \
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
_mm512_bslli_epi128( b,12 ) ); \
} while(0)
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
}
/*
#define MULT24W( a0, a1, mask ) \
do { \
__m512i b = _mm512_xor_si512( a0, \
_mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
} while(0)
*/
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART4W(x,c0,c1,t)\
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = a0;\
#define SUBCRUMB4W( a0, a1, a2, a3 ) \
{ \
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512(a2,a3);\
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0);\
a1 = _mm512_or_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
t = _mm512_xor_si512(t,a1);\
a2 = _mm512_and_si512(a2,a1);\
a1 = mm512_xnor(a1,a0);\
a0 = t;
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a0 = t; \
}
/*
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = _mm512_load_si512(&a0);\
a0 = _mm512_or_si512(a0,a1);\
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
a0 = _mm512_xor_si512(a0,a3);\
a3 = _mm512_and_si512(a3,t);\
a1 = _mm512_xor_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a0);\
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
a2 = _mm512_xor_si512(a2,a1);\
a1 = _mm512_or_si512(a1,a3);\
t = _mm512_xor_si512(t,a1);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a1);\
a1 = _mm512_xor_si512(a1,a0);\
a0 = _mm512_load_si512(&t);
*/
#define MIXWORD4W( a, b ) \
b = _mm512_xor_si512( a, b ); \
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 2 ) ); \
b = _mm512_xor_si512( a, _mm512_rol_epi32( b, 14 ) ); \
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 10 ) ); \
b = _mm512_rol_epi32( b, 1 );
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = mm512_xoror( a, t1, t2 ); \
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
#define STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB4W( x0, x1, x2, x3 ); \
SUBCRUMB4W( x5, x6, x7, x4 ); \
MIXWORD4W( x0, x4 ); \
MIXWORD4W( x1, x5 ); \
MIXWORD4W( x2, x6 ); \
MIXWORD4W( x3, x7 ); \
ADD_CONSTANT4W( x0, x4, c0, c1 );
/*
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
*/
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm512_shuffle_epi32(a1,147);\
t0 = _mm512_load_si512(&a1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
t0 = _mm512_unpackhi_epi32(t0,a0);\
t1 = _mm512_shuffle_epi32(t0,78);\
a0 = _mm512_shuffle_epi32(a1,78);\
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
t0 = _mm512_unpacklo_epi32(t0,t1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
a0 = _mm512_load_si512(&a1);\
a0 = _mm512_unpackhi_epi64(a0,t0);\
a1 = _mm512_unpacklo_epi64(a1,t0);\
a1 = _mm512_shuffle_epi32(a1,57);\
MIXWORD4W(a0,a1,tmp0,tmp1);\
ADD_CONSTANT4W(a0,a1,c0,c1);
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm512_load_si512(&r1);\
q2 = _mm512_load_si512(&p1);\
r2 = _mm512_shuffle_epi32(r2,216);\
p2 = _mm512_shuffle_epi32(p2,216);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
s2 = _mm512_unpackhi_epi32(s2,r0);\
q2 = _mm512_unpackhi_epi32(q2,p0);\
s0 = _mm512_load_si512(&r2);\
q0 = _mm512_load_si512(&p2);\
r2 = _mm512_unpacklo_epi64(r2,r1);\
p2 = _mm512_unpacklo_epi64(p2,p1);\
s1 = _mm512_load_si512(&s0);\
q1 = _mm512_load_si512(&q0);\
s0 = _mm512_unpackhi_epi64(s0,r1);\
q0 = _mm512_unpackhi_epi64(q0,p1);\
r2 = _mm512_shuffle_epi32(r2,225);\
p2 = _mm512_shuffle_epi32(p2,225);\
r0 = _mm512_load_si512(&s1);\
p0 = _mm512_load_si512(&q1);\
s0 = _mm512_shuffle_epi32(s0,225);\
q0 = _mm512_shuffle_epi32(q0,225);\
s1 = _mm512_unpacklo_epi64(s1,s2);\
q1 = _mm512_unpacklo_epi64(q1,q2);\
r0 = _mm512_unpackhi_epi64(r0,s2);\
p0 = _mm512_unpackhi_epi64(p0,q2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s3 = _mm512_load_si512(&r2);\
q3 = _mm512_load_si512(&p2);
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm512_load_si512(&r0);\
q0 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r2);\
q1 = _mm512_load_si512(&p2);\
r0 = _mm512_unpackhi_epi32(r0,r1);\
p0 = _mm512_unpackhi_epi32(p0,p1);\
r2 = _mm512_unpackhi_epi32(r2,r3);\
p2 = _mm512_unpackhi_epi32(p2,p3);\
s0 = _mm512_unpacklo_epi32(s0,r1);\
q0 = _mm512_unpacklo_epi32(q0,p1);\
s1 = _mm512_unpacklo_epi32(s1,r3);\
q1 = _mm512_unpacklo_epi32(q1,p3);\
r1 = _mm512_load_si512(&r0);\
p1 = _mm512_load_si512(&p0);\
r0 = _mm512_unpackhi_epi64(r0,r2);\
p0 = _mm512_unpackhi_epi64(p0,p2);\
s0 = _mm512_unpackhi_epi64(s0,s1);\
q0 = _mm512_unpackhi_epi64(q0,q1);\
r1 = _mm512_unpacklo_epi64(r1,r2);\
p1 = _mm512_unpacklo_epi64(p1,p2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r1);\
q1 = _mm512_load_si512(&p1);
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm512_shuffle_epi32( a1, 147 ); \
t0 = _mm512_load_si512( &a1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
t1 = _mm512_shuffle_epi32( t0, 78 ); \
a0 = _mm512_shuffle_epi32( a1, 78 ); \
SUBCRUMB4W( t1, t0, a0, a1 ); \
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
a0 = _mm512_load_si512( &a1 ); \
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
a1 = _mm512_shuffle_epi32( a1, 57 ); \
MIXWORD4W( a0, a1 ); \
ADD_CONSTANT4W( a0, a1, c0, c1 );
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm512_load_si512(&r3);\
@@ -279,8 +159,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
__m512i t0, t1;
__m512i *chainv = state->chainv;
__m512i msg0, msg1;
__m512i tmp[2];
__m512i x[8];
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
@@ -372,42 +251,30 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 0), cns4w( 1) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 2), cns4w( 3) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 4), cns4w( 5) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 6), cns4w( 7) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 8), cns4w( 9) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(10), cns4w(11) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(12), cns4w(13) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(14), cns4w(15) );
MIXTON10244W( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
MIXTON10244W( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7] );
/* Process last 256-bit block */
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31) );
}
void finalization512_4way( luffa_4way_context *state, uint32 *b )
@@ -683,10 +550,11 @@ int luffa_4way_update_close( luffa_4way_context *state,
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);
#define ADD_CONSTANT( a, b, c0, c1 ) \
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
/*
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
@@ -694,127 +562,83 @@ do { \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
*/
#define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), c0, c1);
#define MULT2( a0, a1, mask ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = a0;\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = mm256_not( a1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = mm256_not( a0 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = t;\
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m256i t = a0; \
a0 = _mm256_or_si256( a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = mm256_not( a1 ); \
a0 = _mm256_xor_si256( a0, a3 ); \
a3 = _mm256_and_si256( a3, t ); \
a1 = _mm256_xor_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
a2 = _mm256_and_si256( a2, a0 ); \
a0 = mm256_not( a0 ); \
a2 = _mm256_xor_si256( a2, a1 ); \
a1 = _mm256_or_si256( a1, a3 ); \
t = _mm256_xor_si256( t, a1 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = _mm256_xor_si256( a1, a0 ); \
a0 = t; \
}
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define MIXWORD( a, b ) \
{ \
__m256i t1, t2; \
b = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( a, 2 ); \
t2 = _mm256_srli_epi32( a, 30 ); \
a = _mm256_or_si256( t1, t2 ); \
a = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( b, 14 ); \
t2 = _mm256_srli_epi32( b, 18 ); \
b = _mm256_or_si256( t1, t2 ); \
b = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( a, 10 ); \
t2 = _mm256_srli_epi32( a, 22 ); \
a = _mm256_or_si256( t1,t2 ); \
a = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( b,1 ); \
t2 = _mm256_srli_epi32( b,31 ); \
b = _mm256_or_si256( t1, t2 ); \
}
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
SUBCRUMB( x5, x6, x7, x4 ); \
MIXWORD( x0, x4 ); \
MIXWORD( x1, x5 ); \
MIXWORD( x2, x6 ); \
MIXWORD( x3, x7 ); \
ADD_CONSTANT( x0, x4, c0, c1 );
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);\
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm256_shuffle_epi32( a1, 147); \
t0 = _mm256_load_si256( &a1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
t1 = _mm256_shuffle_epi32( t0, 78 ); \
a0 = _mm256_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 );\
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
a0 = _mm256_load_si256( &a1 ); \
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
a1 = _mm256_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
@@ -857,9 +681,8 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
const __m256i MASK = m256_const1_i128( 0xffffffff );
t0 = chainv[0];
t1 = chainv[1];
@@ -958,42 +781,30 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[7] = mm256_rol_32( chainv[7], 3 );
chainv[9] = mm256_rol_32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
}
/***************************************************/

View File

@@ -23,25 +23,25 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
/*
static inline __m256i mult2_avx2( a )
{
__m128 a0, a0, b;
a0 = mm128_extractlo_256( a );
a1 = mm128_extracthi_256( a );
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
return mm256_concat_128( a1, a0 );
}
*/
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
@@ -73,13 +73,13 @@ static inline __m256i mult2_avx2( a )
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = _mm_andnot_si128(a1,ALLONE);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = _mm_andnot_si128(a0,ALLONE);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
@@ -255,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
__m128i CNS128[32];
__m128i ALLONE;
#if !defined(__SSE4_1__)
__m128i MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -365,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );

View File

@@ -13,10 +13,9 @@
#if defined (ALLIUM_16WAY)
typedef struct {
blake256_16way_context blake;
typedef union {
keccak256_8way_context keccak;
cube_4way_context cube;
cube_4way_2buf_context cube;
skein256_8way_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
@@ -25,47 +24,31 @@ typedef struct {
#endif
} allium_16way_ctx_holder;
static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_16way_ctx.keccak );
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_16way_ctx.skein );
#if defined(__VAES__)
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
#else
init_groestl256( &allium_16way_ctx.groestl, 32 );
#endif
return true;
}
void allium_16way_hash( void *state, const void *input )
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -75,7 +58,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
// rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
@@ -111,12 +94,11 @@ void allium_16way_hash( void *state, const void *input )
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -124,8 +106,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
@@ -160,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
@@ -207,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
#endif
}
@@ -214,35 +197,60 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, vdata );
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, hash+(lane<<3), mythr );
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
@@ -252,10 +260,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
typedef union {
keccak256_4way_context keccak;
cubehashParam cube;
cube_2way_context cube;
skein256_4way_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
@@ -264,25 +271,11 @@ typedef struct {
#endif
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_8way_ctx.skein );
#if defined(__VAES__)
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
#else
init_groestl256( &allium_8way_ctx.groestl, 32 );
#endif
return true;
}
void allium_8way_hash( void *hash, const void *input )
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -293,15 +286,14 @@ void allium_8way_hash( void *hash, const void *input )
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhashA );
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
@@ -320,21 +312,19 @@ void allium_8way_hash( void *hash, const void *input )
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
intrlv_2x128( vhashA, hash0, hash1, 256 );
intrlv_2x128( vhashB, hash2, hash3, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
dintrlv_2x128( hash0, hash1, vhashA, 256 );
dintrlv_2x128( hash2, hash3, vhashB, 256 );
intrlv_2x128( vhashA, hash4, hash5, 256 );
intrlv_2x128( vhashB, hash6, hash7, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
dintrlv_2x128( hash4, hash5, vhashA, 256 );
dintrlv_2x128( hash6, hash7, vhashB, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -348,6 +338,7 @@ void allium_8way_hash( void *hash, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
@@ -356,8 +347,8 @@ void allium_8way_hash( void *hash, const void *input )
#if defined(__VAES__)
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -392,36 +383,60 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, vdata );
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;

View File

@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
@@ -175,20 +175,16 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
gate->scanhash = (void*)&scanhash_allium_16way;
gate->hash = (void*)&allium_16way_hash;
#elif defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
| VAES_OPT | VAES256_OPT;
| VAES_OPT;
opt_target_factor = 256.0;
return true;
};

View File

@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_16WAY)
void allium_16way_hash( void *state, const void *input );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_16way_ctx();
#elif defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#else

View File

@@ -14,42 +14,32 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
@@ -97,40 +87,62 @@ void lyra2z_16way_hash( void *state, const void *input )
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ptarget[7] = 0x0000ff;
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
lyra2z_16way_midstate( vdata );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
@@ -145,30 +157,20 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -182,7 +184,6 @@ void lyra2z_8way_hash( void *state, const void *input )
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -197,43 +198,66 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
if ( bench ) ptarget[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
lyra2z_8way_midstate( vdata );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, vdata );
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
}
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(LYRA2Z_4WAY)

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
static __thread uint64_t* lyra2z330_wholeMatrix;
void lyra2z330_hash(void *state, const void *input, uint32_t height)
{

View File

@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
// overlap it's unified.
// As a result normal is Nrows-2 / Nrows.
// for 4 rows: 1 unified, 2 overlap, 1 normal.
// for 8 rows: 1 unified, 2 overlap, 56 normal.
// for 8 rows: 1 unified, 2 overlap, 5 normal.
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0 = _mm512_load_si512( inout0 );
io1 = _mm512_load_si512( inout0 +1 );
io2 = _mm512_load_si512( inout0 +2 );
io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1 );
io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
/*
io0 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
io2 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
*/
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0.v512 = _mm512_load_si512( inout0 );
io1.v512 = _mm512_load_si512( inout0 +1 );
io2.v512 = _mm512_load_si512( inout0 +2 );
io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1 );
io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
/*
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
*/
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
/*
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
*/
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
}
/*
if ( rowOut == rowInOut0 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
}
if ( rowOut == rowInOut1 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
}
*/
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
}
/*
casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
_mm512_mask_store_epi64( inout1, 0xf0, io0.v512 );
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
*/
casti_m256i( inout0, 0 ) = io0.v256lo;
casti_m256i( inout1, 1 ) = io0.v256hi;
casti_m256i( inout0, 2 ) = io1.v256lo;
casti_m256i( inout1, 3 ) = io1.v256hi;
casti_m256i( inout0, 4 ) = io2.v256lo;
casti_m256i( inout1, 5 ) = io2.v256hi;
/*
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );

View File

@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror256_64( s1); \
s3 = mm512_shufll256_64( s3 ); \
s1 = mm512_shuflr256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_rol256_64( s3 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol256_64( s1 ); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_ror256_64( s3 );
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -97,23 +97,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_ror_1x64( s1); \
s3 = mm256_shufll_64( s3 ); \
s1 = mm256_shuflr_64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rol_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rol_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_ror_1x64( s3 );
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -137,25 +137,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_64( s6, s7 );
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -13,6 +13,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/ripemd/sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#define EPSa DBL_EPSILON
#define EPS1 DBL_EPSILON
@@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce )
}
typedef struct {
sph_sha256_context sha256;
sph_sha512_context sha512;
sha256_context sha256;
sph_sha512_context sha512;
sph_keccak512_context keccak;
sph_whirlpool_context whirlpool;
sph_haval256_5_context haval;
@@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx;
void init_m7m_ctx()
{
sph_sha256_init( &m7m_ctx );
sha256_ctx_init( &m7m_ctx.sha256 );
sph_sha512_init( &m7m_ctx.sha512 );
sph_keccak512_init( &m7m_ctx.keccak );
sph_whirlpool_init( &m7m_ctx.whirlpool );
@@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
sph_sha256_context ctxf_sha256;
memcpy(data, pdata, 80);
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
@@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha256_close( &ctx2.sha256, bhash[0] );
sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sha256_final( &ctx2.sha256, bhash[0] );
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha512_close( &ctx2.sha512, bhash[1] );
@@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, hash );
sha256_full( hash, bdata, bytes );
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
@@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
mpzscale=bytes;
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, hash );
}
sha256_full( hash, bdata, bytes );
}
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !opt_benchmark ) )

View File

@@ -15,7 +15,8 @@
#if defined (ANIME_8WAY)
typedef struct {
union _anime_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
#if defined(__VAES__)
@@ -26,23 +27,9 @@ typedef struct {
jh512_8way_context jh;
skein512_8way_context skein;
keccak512_8way_context keccak;
} anime_8way_ctx_holder;
} __attribute__ ((aligned (64)));
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
void init_anime_8way_ctx()
{
blake512_8way_init( &anime_8way_ctx.blake );
bmw512_8way_init( &anime_8way_ctx.bmw );
#if defined(__VAES__)
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
#else
init_groestl( &anime_8way_ctx.groestl, 64 );
#endif
skein512_8way_init( &anime_8way_ctx.skein );
jh512_8way_init( &anime_8way_ctx.jh );
keccak512_8way_init( &anime_8way_ctx.keccak );
}
typedef union _anime_8way_context_overlay anime_8way_context_overlay;
void anime_8way_hash( void *state, const void *input )
{
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
const __m512i bit3_mask = m512_const1_64( 8 );
const __m512i zero = _mm512_setzero_si512();
__mmask8 vh_mask;
anime_8way_ctx_holder ctx;
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
#if defined(__VAES__)
@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
{
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
#elif defined (ANIME_4WAY)
typedef struct {
union _anime_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} anime_4way_ctx_holder;
#if defined(__VAES__)
groestl512_2way_context groestl2;
#endif
} __attribute__ ((aligned (64)));
typedef union _anime_4way_context_overlay anime_4way_context_overlay;
void anime_4way_hash( void *state, const void *input )
{
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
int h_mask;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_ctx_holder ctx;
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, input, 80 );
@@ -293,7 +281,18 @@ void anime_4way_hash( void *state, const void *input )
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );

View File

@@ -13,6 +13,7 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -64,14 +65,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t hash4 [16] __attribute__ ((aligned (64)));
uint32_t hash5 [16] __attribute__ ((aligned (64)));
uint32_t hash6 [16] __attribute__ ((aligned (64)));
uint32_t hash7 [16] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
uint32_t hash4 [16] __attribute__ ((aligned (32)));
uint32_t hash5 [16] __attribute__ ((aligned (32)));
uint32_t hash6 [16] __attribute__ ((aligned (32)));
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
if ( ( vh_mask & 0xff ) != 0xff )
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
{
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
// 4x32 for haval
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
if ( hash0[0] & mask )
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
shavite512_2way_context shavite2;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -633,19 +627,23 @@ union _hmq1725_4way_context_overlay
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
#if defined(__VAES__)
groestl512_2way_context groestl2;
echo_2way_context echo2;
#endif
} __attribute__ ((aligned (64)));
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
intrlv_2x128_512( vhashA, hash0, hash1 );
intrlv_2x128_512( vhashB, hash2, hash3 );
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
@@ -807,7 +811,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
(const BitSequence *)hash3, 64 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );

View File

@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
const __m512i bit3_mask = m512_const1_64( mask );
const __m512i zero = _mm512_setzero_si512();
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
#if defined(__VAES__)
@@ -154,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -169,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
{

File diff suppressed because it is too large Load Diff

View File

@@ -1,186 +0,0 @@
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
/**
* RadioGatun interface.
*
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
* August 24-25, 2006. The main Web site, containing that article, the
* reference code and some test vectors, appears to be currently located
* at the following URL: http://radiogatun.noekeon.org/
*
* The presentation article does not specify endianness or padding. The
* reference code uses the following conventions, which we also apply
* here:
* <ul>
* <li>The input message is an integral number of sequences of three
* words. Each word is either a 32-bit of 64-bit word (depending on
* the version of RadioGatun).</li>
* <li>Input bytes are decoded into words using little-endian
* convention.</li>
* <li>Padding consists of a single bit of value 1, using little-endian
* convention within bytes (i.e. for a byte-oriented input, a single
* byte of value 0x01 is appended), then enough bits of value 0 to finish
* the current block.</li>
* <li>Output consists of 256 bits. Successive output words are encoded
* with little-endian convention.</li>
* </ul>
* These conventions are very close to those we use for PANAMA, which is
* a close ancestor or RadioGatun.
*
* RadioGatun is actually a family of functions, depending on some
* internal parameters. We implement here two functions, with a "belt
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
* variant uses 64-bit words.
*
* Strictly speaking, the name "RadioGatun" should use an acute accent
* on the "u", which we omitted here to keep strict ASCII-compatibility
* of this file.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_radiogatun.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RADIOGATUN_H__
#define SPH_RADIOGATUN_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for RadioGatun[32].
*/
#define SPH_SIZE_radiogatun32 256
/**
* This structure is a context for RadioGatun[32] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[32] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[32]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[156]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 a[19], b[39];
#endif
} sph_radiogatun32_context;
/**
* Initialize a RadioGatun[32] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[32] context (pointer to a
* <code>sph_radiogatun32_context</code>)
*/
void sph_radiogatun32_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[32] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun32(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[32] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accomodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[32] context
* @param dst the destination buffer
*/
void sph_radiogatun32_close(void *cc, void *dst);
#if SPH_64
/**
* Output size (in bits) for RadioGatun[64].
*/
#define SPH_SIZE_radiogatun64 256
/**
* This structure is a context for RadioGatun[64] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[64] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[64]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[312]; /* first field, for alignment */
unsigned data_ptr;
sph_u64 a[19], b[39];
#endif
} sph_radiogatun64_context;
/**
* Initialize a RadioGatun[64] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[64] context (pointer to a
* <code>sph_radiogatun64_context</code>)
*/
void sph_radiogatun64_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[64] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun64(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[64] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accomodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[64] context
* @param dst the destination buffer
*/
void sph_radiogatun64_close(void *cc, void *dst);
#endif
#endif

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdio.h>
double lbry_calc_network_diff( struct work *work )
long double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -12,7 +12,7 @@ double lbry_calc_network_diff( struct work *work )
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
long double d = (long double)0x0000ffff / (long double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;

View File

@@ -7,24 +7,19 @@
#include <string.h>
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
void lbry_hash(void* output, const void* input)
{
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) hashA[16];
uint32_t _ALIGN(64) hashB[16];
uint32_t _ALIGN(64) hashC[16];
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, input, 112 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sha256_full( hashA, input, 112 );
sha256_full( hashA, hashA, 32 );
sph_sha512_init( &ctx_sha512 );
sph_sha512( &ctx_sha512, hashA, 32 );
@@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input)
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
sph_ripemd160_close( &ctx_ripemd, hashC );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashB, 20 );
sph_sha256( &ctx_sha256, hashC, 20 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sha256_ctx_init( &ctx_sha256 );
sha256_update( &ctx_sha256, hashB, 20 );
sha256_update( &ctx_sha256, hashC, 20 );
sha256_final( &ctx_sha256, hashA );
sha256_full( hashA, hashA, 32 );
memcpy( output, hashA, 32 );
}

View File

@@ -35,6 +35,7 @@
#include "sph_ripemd.h"
#if 0
/*
* Round functions for RIPEMD (original).
*/
@@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = {
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
};
#endif
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
@@ -63,6 +65,8 @@ static const sph_u32 IV[5] = {
#define ROTL SPH_ROTL32
#if 0
/* ===================================================================== */
/*
* RIPEMD (original hash, deprecated).
@@ -479,7 +483,7 @@ sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
* One round of RIPEMD-128. The data must be aligned for 32-bit access.
*/
static void
ripemd128_round(const unsigned char *data, sph_u32 r[5])
ripemd128_round(const unsigned char *data, sph_u32 r[4])
{
#if SPH_LITTLE_FAST
@@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
#undef RIPEMD128_IN
}
#endif
/* ===================================================================== */
/*
* RIPEMD-160.

View File

@@ -84,6 +84,7 @@
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if 0
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
@@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst);
*/
void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
#endif
/* ===================================================================== */
/**

View File

@@ -69,8 +69,12 @@ typedef unsigned int uint;
#define SCRYPT_HASH_BLOCK_SIZE 64U
#define SCRYPT_HASH_DIGEST_SIZE 32U
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
#define ROTL32(a,b) rol32(a,b)
#define ROTR32(a,b) ror32(a,b)
#define U8TO32_BE(p) \
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,70 @@
#ifndef SCRYPT_CORE_4WAY_H__
#define SCRYPT_CORE_4WAY_H__
#include "simd-utils.h"
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
// Serial SIMD over 4 way parallel
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
// 4 way parallel over serial SIMD
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
#endif
#if defined(__AVX2__)
void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
// 2 way parallel over SIMD128
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
// Double buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
// Triplee buffered 2 way parallel over SIMD128
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
// Serial SIMD128 over 2 way parallel
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
// Double buffered simd over parallel
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
// Triple buffered 2 way
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
// Quadruple buffered
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
#endif
#if defined(__SSE2__)
// Parallel 4 way, 4x memory
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
// Linear SIMD 1 way, 1x memory, lowest
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
// Double buffered, 2x memory
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Triple buffered
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Quadruple buffered, 4x memory
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
#endif
// For reference only
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
#endif

View File

@@ -0,0 +1,206 @@
#include "scrypt-core-ref.h"
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
{
uint32_t x0 = (B[ 0] ^= C[ 0]),
x1 = (B[ 1] ^= C[ 1]),
x2 = (B[ 2] ^= C[ 2]),
x3 = (B[ 3] ^= C[ 3]);
uint32_t x4 = (B[ 4] ^= C[ 4]),
x5 = (B[ 5] ^= C[ 5]),
x6 = (B[ 6] ^= C[ 6]),
x7 = (B[ 7] ^= C[ 7]);
uint32_t x8 = (B[ 8] ^= C[ 8]),
x9 = (B[ 9] ^= C[ 9]),
xa = (B[10] ^= C[10]),
xb = (B[11] ^= C[11]);
uint32_t xc = (B[12] ^= C[12]),
xd = (B[13] ^= C[13]),
xe = (B[14] ^= C[14]),
xf = (B[15] ^= C[15]);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
/* Operate on columns. */
x4 ^= ROTL(x0 + xc, 7);
x9 ^= ROTL(x5 + x1, 7);
xe ^= ROTL(xa + x6, 7);
x3 ^= ROTL(xf + xb, 7);
x8 ^= ROTL(x4 + x0, 9);
xd ^= ROTL(x9 + x5, 9);
x2 ^= ROTL(xe + xa, 9);
x7 ^= ROTL(x3 + xf, 9);
xc ^= ROTL(x8 + x4, 13);
x1 ^= ROTL(xd + x9, 13);
x6 ^= ROTL(x2 + xe, 13);
xb ^= ROTL(x7 + x3, 13);
x0 ^= ROTL(xc + x8, 18);
x5 ^= ROTL(x1 + xd, 18);
xa ^= ROTL(x6 + x2, 18);
xf ^= ROTL(xb + x7, 18);
/* Operate on rows. */
x1 ^= ROTL(x0 + x3, 7);
x6 ^= ROTL(x5 + x4, 7);
xb ^= ROTL(xa + x9, 7);
xc ^= ROTL(xf + xe, 7);
x2 ^= ROTL(x1 + x0, 9);
x7 ^= ROTL(x6 + x5, 9);
x8 ^= ROTL(xb + xa, 9);
xd ^= ROTL(xc + xf, 9);
x3 ^= ROTL(x2 + x1, 13);
x4 ^= ROTL(x7 + x6, 13);
x9 ^= ROTL(x8 + xb, 13);
xe ^= ROTL(xd + xc, 13);
x0 ^= ROTL(x3 + x2, 18);
x5 ^= ROTL(x4 + x7, 18);
xa ^= ROTL(x9 + x8, 18);
xf ^= ROTL(xe + xd, 18);
B[ 0] += x0;
B[ 1] += x1;
B[ 2] += x2;
B[ 3] += x3;
B[ 4] += x4;
B[ 5] += x5;
B[ 6] += x6;
B[ 7] += x7;
B[ 8] += x8;
B[ 9] += x9;
B[10] += xa;
B[11] += xb;
B[12] += xc;
B[13] += xd;
B[14] += xe;
B[15] += xf;
}
/**
* @param X input/ouput
* @param V scratch buffer
* @param N factor (def. 1024)
*/
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
{
for (uint32_t i = 0; i < N; i++) {
memcpy(&V[i * 32], X, 128);
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
for (uint32_t i = 0; i < N; i++) {
uint32_t j = 32 * (X[16] & (N - 1));
for (uint8_t k = 0; k < 32; k++)
X[k] ^= V[j + k];
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -39,10 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
sph_sha256_context ctx;
sph_sha256_init( &ctx );
sph_sha256( &ctx, in, len );
sph_sha256_close( &ctx, digest );
sha256_context ctx;
sha256_ctx_init( &ctx );
sha256_update( &ctx, in, len );
sha256_final( &ctx, digest );
}
/**
@@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
void
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
{
unsigned char pad[64];
unsigned char pad[64] __attribute__ ((aligned (64)));
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
@@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sph_sha256_init( &ctx->ictx );
sph_sha256( &ctx->ictx, K, Klen );
sph_sha256_close( &ctx->ictx, khash );
sha256_ctx_init( &ctx->ictx );
sha256_update( &ctx->ictx, K, Klen );
sha256_final( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sph_sha256_init( &ctx->ictx );
sha256_ctx_init( &ctx->ictx );
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
memset( pad + Klen, 0x36, 64 - Klen );
sph_sha256( &ctx->ictx, pad, 64 );
sha256_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sph_sha256_init( &ctx->octx );
sha256_ctx_init( &ctx->octx );
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
memset( pad + Klen, 0x5c, 64 - Klen );
sph_sha256( &ctx->octx, pad, 64 );
sha256_update( &ctx->octx, pad, 64 );
}
/* Add bytes to the HMAC-SHA256 operation. */
@@ -102,18 +101,17 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
/* Feed data to the inner SHA256 operation. */
sph_sha256( &ctx->ictx, in, len );
sha256_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
{
unsigned char ihash[32];
sph_sha256_close( &ctx->ictx, ihash );
sph_sha256( &ctx->octx, ihash, 32 );
sph_sha256_close( &ctx->octx, digest );
uint32_t ihash[8] __attribute__ ((aligned (32)));
sha256_final( &ctx->ictx, ihash );
sha256_update( &ctx->octx, ihash, 32 );
sha256_final( &ctx->octx, digest );
}
/**
@@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
{
HMAC_SHA256_CTX PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint64_t _ALIGN(128) T[4];
uint64_t _ALIGN(128) U[4];
// uint8_t _ALIGN(128) T[32];
// uint8_t _ALIGN(128) U[32];
uint32_t ivec;
size_t i, clen;
uint64_t j;
@@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
// _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
// _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );
// for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
// for ( k = 0; k < 32; k++ )
// T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */

View File

@@ -31,18 +31,18 @@
#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
#include "sha256-hash.h"
typedef struct HMAC_SHA256Context
{
sph_sha256_context ictx;
sph_sha256_context octx;
sha256_context ictx;
sha256_context octx;
} HMAC_SHA256_CTX;
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
size_t len, uint8_t digest[32] );

View File

@@ -1,270 +0,0 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 millions Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* from the context structure. The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, data, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
#ifdef PW01
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
#else
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
#endif
ptr += 8;
if ( ptr > SPH_MAXPAD )
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( c->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
RFUN( sc->buf, SPH_VAL );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}

View File

@@ -51,7 +51,6 @@ typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
@@ -59,8 +58,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
#endif // SSE2
@@ -72,15 +79,23 @@ typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
#endif // AVX2
@@ -92,19 +107,23 @@ typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid );
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
#endif // AVX512

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details.
*/
#include "algo-gate-api.h"
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
@@ -180,6 +180,9 @@ static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
// this performs the entire hash all over again, why?
// because main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -195,6 +198,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
hash[i] = swab32(hash[i]);
}
/*
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
@@ -241,6 +245,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len)
}
#endif
*/
static inline void sha256d_preextend(uint32_t *W)
{
@@ -489,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work,
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -550,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work,
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -606,11 +611,11 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[64];
uint32_t _ALIGN(32) hash[8];
uint32_t _ALIGN(32) midstate[8];
@@ -621,12 +626,12 @@ int scanhash_sha256d( struct work *work,
int thr_id = mythr->id; // thr_id arg is deprecated
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
if ( sha256_use_8way() )
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
if ( sha256_use_4way() )
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
@@ -653,6 +658,7 @@ int scanhash_sha256d( struct work *work,
return 0;
}
/*
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -682,13 +688,20 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
pdata[19] = n;
return 0;
}
*/
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
gate->hash = (void*)&sha256d;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
// gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -7,9 +7,9 @@
#if defined(__SHA__)
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -7,9 +7,9 @@
#if defined(__SHA__)
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
void sha256_opt_transform( uint32_t *state_out, const void *input,
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
@@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input,
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif

View File

@@ -1,18 +0,0 @@
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SHA__)
void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
#endif
#endif

142
algo/sha/sha256-hash.c Normal file
View File

@@ -0,0 +1,142 @@
#include "sha256-hash.h"
static const uint32_t SHA256_IV[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/*
static const uint8_t SHA256_PAD[64] =
{
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
void sha256_ctx_init( sha256_context *ctx )
{
memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV );
ctx->count = 0;
}
void sha256_update( sha256_context *ctx, const void *data, size_t len )
{
int ptr = ctx->count & 0x3f;
const uint8_t *src = data;
ctx->count += (uint64_t)len;
if ( len < 64 - ptr )
{
memcpy( ctx->buf + ptr, src, len );
return;
}
memcpy( ctx->buf + ptr, src, 64 - ptr );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
src += 64 - ptr;
len -= 64 - ptr;
while ( len >= 64 )
{
sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state );
src += 64;
len -= 64;
}
memcpy( ctx->buf, src, len );
}
#if 0
void sha256_final( sha256_context *ctx, uint32_t *hash )
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = ctx->count & 0x3f;
// r = ( ctx->count >> 3 ) & 0x3f;
//printf("final: count= %d, r= %d\n", ctx->count, r );
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if ( r < 56 )
{
/* Pad to 56 mod 64. */
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
}
else
{
/* Finish the current block and mix. */
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset( &ctx->buf[0], 0, 56 );
}
/* Add the terminating bit-count. */
ctx->buf[56] = bswap_64( ctx->count << 3 );
// ctx->buf[56] = bswap_64( ctx->count );
// be64enc( &ctx->buf[56], ctx->count );
/* Mix in the final block. */
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
/*
// be32enc_vect(digest, ctx->state, 4);
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
// Encode vector, two words at a time.
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
*/
}
#endif
void sha256_final( sha256_context *ctx, void *hash )
{
int ptr = ctx->count & 0x3f;
ctx->buf[ ptr++ ] = 0x80;
if ( ptr > 56 )
{
memset( ctx->buf + ptr, 0, 64 - ptr );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
memset( ctx->buf, 0, 56 );
}
else
memset( ctx->buf + ptr, 0, 56 - ptr );
*(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
for ( int i = 0; i < 8; i++ )
( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] );
}
void sha256_full( void *hash, const void *data, size_t len )
{
sha256_context ctx;
sha256_ctx_init( &ctx );
sha256_update( &ctx, data, len );
sha256_final( &ctx, hash );
}

60
algo/sha/sha256-hash.h Normal file
View File

@@ -0,0 +1,60 @@
#ifndef SHA256_HASH_H__
#define SHA256_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha2.h"
// generic interface
typedef struct {
unsigned char buf[64]; /* first field, for alignment */
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__SHA__)
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#else
// without SHA...
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h"
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -10,12 +10,14 @@
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -23,7 +25,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
@@ -35,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
@@ -45,47 +55,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_16way_transform( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_transform( hash32, block, midstate );
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_8WAY)
@@ -93,12 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i block[16] __attribute__ ((aligned (64)));
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -113,10 +120,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -127,36 +142,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
if ( unlikely(
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
@@ -174,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i block[16] __attribute__ ((aligned (64)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
__m128i midstate2[8] __attribute__ ((aligned (32)));
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -198,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -209,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
sha256_4way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
if ( unlikely(
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -250,3 +269,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

46
algo/sha/sha256d-4way.h Normal file
View File

@@ -0,0 +1,46 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

8
algo/sha/sha256d.c Normal file
View File

@@ -0,0 +1,8 @@
#include "sha256d.h"
void sha256d( void *hash, const void *data, int len )
{
sha256_full( hash, data, len );
sha256_full( hash, hash, 32 );
}

7
algo/sha/sha256d.h Normal file
View File

@@ -0,0 +1,7 @@
#include "algo-gate-api.h"
#include <string.h>
#include <inttypes.h>
#include "sha256-hash.h"
void sha256d( void *hash, const void *data, int len );

View File

@@ -3,14 +3,14 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64)));
static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64)));
void sha256q_midstate( const void* input )
{
sph_sha256_init( &sha256q_ctx );
sph_sha256( &sha256q_ctx, input, 64 );
sha256_ctx_init( &sha256q_ctx );
sha256_update( &sha256q_ctx, input, 64 );
}
int sha256q_hash( void* output, const void* input )
@@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input )
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
sph_sha256_context ctx __attribute__ ((aligned (64)));
sha256_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
sph_sha256( &ctx, input + midlen, tail );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, output );
sha256_update( &ctx, input + midlen, tail );
sha256_final( &ctx, hash );
sha256_full( hash, hash, 32 );
sha256_full( hash, hash, 32 );
sha256_full( output, hash, 32 );
return 1;
}

View File

@@ -10,13 +10,14 @@
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -31,12 +32,19 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// initialize state
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
@@ -46,48 +54,40 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
// sha256_16way_transform( hash32, block, midstate );
// 1. final 16 bytes of data, pre-padded
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
sha256_16way_transform_le( block, block, initstate );
// 3. 32 byte hash from 2.
memcpy_512( block, hash32, 8 );
sha256_16way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
if ( unlikely(
sha256_16way_transform_le_short( hash32, block, initstate ) ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -102,12 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i block[16] __attribute__ ((aligned (64)));
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -122,10 +124,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -136,44 +146,40 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
sha256_8way_transform_le( block, block, initstate );
// 3. 32 byte hash from 2.
memcpy_256( block, hash32, 8 );
sha256_8way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
if ( unlikely(
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -182,17 +188,110 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
#endif
#if defined(SHA256T_4WAY)
// Optimizations are slower with AVX/SSE2
// https://github.com/JayDDee/cpuminer-opt/issues/344
/*
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
__m128i midstate2[8] __attribute__ ((aligned (32)));
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_4way_transform_le( block, block, initstate );
// 3. 32 byte hash from 2.
if ( unlikely(
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
{
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i block[16] __attribute__ ((aligned (64)));
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -211,6 +310,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -222,29 +329,13 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy_128( block, hash32, 8 );
sha256_4way_transform( hash32, block, initstate );
// byte swap final hash for testing
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( block, block, initstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
@@ -265,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif

View File

@@ -4,120 +4,12 @@
#include <string.h>
#include <stdio.h>
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash-opt.h"
#include "sha256-hash.h"
#if defined(__SHA__)
// Only used on CPUs with SHA
/*
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
{
sph_sha256_init( &sha256t_ctx );
sph_sha256( &sha256t_ctx, input, 64 );
}
int sha256t_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) hash[16];
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
sph_sha256_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
sph_sha256( &ctx, input + midlen, tail );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, output );
return 1;
}
*/
/*
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block[16] __attribute__ ((aligned (64)));
uint32_t hash32[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
// uint32_t edata[20] __attribute__((aligned(64)));
// uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// mm128_bswap32_80( edata, pdata );
// sha256t_midstate( edata );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block, pdata + 16, 16 );
block[ 4] = 0x80000000;
memset( block + 5, 0, 40 );
block[15] = 80*8; // bit count
sha256_opt_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy( block, hash32, 32 );
block[ 8] = 0x80000000;
memset( block + 9, 0, 24 );
block[15] = 32*8; // bit count
sha256_opt_transform( hash32, block, initstate );
// 3. 32 byte hash from 2.
memcpy( block, hash32, 32 );
sha256_opt_transform( hash32, block, initstate );
// byte swap final hash for testing
casti_m128i( hash32, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
casti_m128i( hash32, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
submit_solution( work, hash32, mythr );
n++;
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
sha256_opt_transform_le( midstate, pdata, initstate );
do
{
@@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
@@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =

View File

@@ -95,32 +95,36 @@ static const uint64_t K512[80] =
// SHA-512 8 way 64 bit
#define CH8W(X, Y, Z) \
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )
#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ8W(X, Y, Z) \
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define BSG8W_5_0(x) \
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )
#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \
_mm512_ror_epi64( x, 34 ), \
_mm512_ror_epi64( x, 39 ) )
#define BSG8W_5_1(x) \
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )
#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \
_mm512_ror_epi64( x, 18 ), \
_mm512_ror_epi64( x, 41 ) )
#define SSG8W_5_0(x) \
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )
#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \
_mm512_ror_epi64( x, 8 ), \
_mm512_srli_epi64( x, 7 ) )
#define SSG8W_5_1(x) \
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \
_mm512_ror_epi64( x, 61 ), \
_mm512_srli_epi64( x, 6 ) )
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
D = _mm512_add_epi64( D, T1 ); \
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
__m512i T1 = BSG8W_5_1( E ); \
__m512i T2 = BSG8W_5_0( A ); \
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
T1 = _mm512_add_epi64( T1, H ); \
T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \
T1 = _mm512_add_epi64( T1, T0 ); \
D = _mm512_add_epi64( D, T1 ); \
H = _mm512_add_epi64( T1, T2 ); \
} while (0)
@@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
/*
#define MAJ(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
*/
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
@@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
/*
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
#define BSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
*/
/*
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
return _mm256_add_epi64( w0a, w1a );
}
/*
#define SSG512x2_0( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-15], 1 ); \
X1a = mm256_ror_64( W[i-14], 1 ); \
X0b = mm256_ror_64( W[i-15], 8 ); \
X1b = mm256_ror_64( W[i-14], 8 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
#define SSG512x2_1( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-2],19 ); \
X1a = mm256_ror_64( W[i-1],19 ); \
X0b = mm256_ror_64( W[i-2],61 ); \
X1b = mm256_ror_64( W[i-1],61 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
__m256i T1 = mm256_ror_64( E, 23 ); \
__m256i T2 = mm256_ror_64( A, 5 ); \
__m256i T3 = _mm256_xor_si256( F, G ); \
__m256i T4 = _mm256_or_si256( A, B ); \
__m256i T5 = _mm256_and_si256( A, B ); \
K = _mm256_add_epi64( K, W[i] ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T3 = _mm256_and_si256( T3, E ); \
T4 = _mm256_and_si256( T4, C ); \
K = _mm256_add_epi64( H, K ); \
T1 = mm256_ror_64( T1, 4 ); \
T2 = mm256_ror_64( T2, 6 ); \
T3 = _mm256_xor_si256( T3, G ); \
T4 = _mm256_or_si256( T4, T5 ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T1 = mm256_ror_64( T1, 14 ); \
T2 = mm256_ror_64( T2, 28 ); \
T1 = _mm256_add_epi64( T1, T3 ); \
T2 = _mm256_add_epi64( T2, T4 ); \
T1 = _mm256_add_epi64( T1, K ); \
H = _mm256_add_epi64( T1, T2 ); \
D = _mm256_add_epi64( D, T1 ); \
} while (0)
*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
__m256i T1 = BSG5_1(E); \
__m256i T2 = BSG5_0(A); \
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
*/
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi64( D, T1 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{

View File

@@ -71,198 +71,6 @@ static const sph_u32 H256[8] = {
* of the compression function.
*/
#if defined(__SHA__)
#include "simd-utils.h"
static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
}
#else // no SHA
/*
static const sph_u32 K[64] = {
@@ -875,8 +683,54 @@ sha2_round(const unsigned char *data, sph_u32 r[8])
#undef SHA2_IN
}
#endif // SHA else
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) (data[x])
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
memcpy( state_out, state_in, 32 );
#define SHA2_IN(x) sph_dec32be_aligned( data+(x) )
SHA2_ROUND_BODY( SHA2_IN, state_out );
#undef SHA2_IN
}
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */
void

View File

@@ -207,6 +207,16 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
void sph_sha256_full( void *dst, const void *data, size_t len );
// These shouldn't be called directly, use sha256-hash.h generic functions
// sha256_transform_le & sha256_transform_be instead.
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64

View File

@@ -33,6 +33,7 @@
#include <stddef.h>
#include <string.h>
// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
@@ -44,50 +45,37 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -124,18 +112,18 @@ extern "C"{
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m256_const1_64( 0xE782B699E782B699 ); \
A3 = m256_const1_64( 0x5530463255304632 ); \
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
@@ -174,18 +162,18 @@ extern "C"{
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -284,8 +272,8 @@ do { \
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
@@ -308,73 +296,71 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
_mm256_set1_epi32(5UL) ) ), \
_mm256_set1_epi32(3UL) ) ) ); \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define APPLY_P8 \
do { \
@@ -397,47 +383,47 @@ do { \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
AB = _mm256_add_epi32( AB, C6 ); \
AA = _mm256_add_epi32( AA, C5 ); \
A9 = _mm256_add_epi32( A9, C4 ); \
A8 = _mm256_add_epi32( A8, C3 ); \
A7 = _mm256_add_epi32( A7, C2 ); \
A6 = _mm256_add_epi32( A6, C1 ); \
A5 = _mm256_add_epi32( A5, C0 ); \
A4 = _mm256_add_epi32( A4, CF ); \
A3 = _mm256_add_epi32( A3, CE ); \
A2 = _mm256_add_epi32( A2, CD ); \
A1 = _mm256_add_epi32( A1, CC ); \
A0 = _mm256_add_epi32( A0, CB ); \
AB = _mm256_add_epi32( AB, CA ); \
AA = _mm256_add_epi32( AA, C9 ); \
A9 = _mm256_add_epi32( A9, C8 ); \
A8 = _mm256_add_epi32( A8, C7 ); \
A7 = _mm256_add_epi32( A7, C6 ); \
A6 = _mm256_add_epi32( A6, C5 ); \
A5 = _mm256_add_epi32( A5, C4 ); \
A4 = _mm256_add_epi32( A4, C3 ); \
A3 = _mm256_add_epi32( A3, C2 ); \
A2 = _mm256_add_epi32( A2, C1 ); \
A1 = _mm256_add_epi32( A1, C0 ); \
A0 = _mm256_add_epi32( A0, CF ); \
AB = _mm256_add_epi32( AB, CE ); \
AA = _mm256_add_epi32( AA, CD ); \
A9 = _mm256_add_epi32( A9, CC ); \
A8 = _mm256_add_epi32( A8, CB ); \
A7 = _mm256_add_epi32( A7, CA ); \
A6 = _mm256_add_epi32( A6, C9 ); \
A5 = _mm256_add_epi32( A5, C8 ); \
A4 = _mm256_add_epi32( A4, C7 ); \
A3 = _mm256_add_epi32( A3, C6 ); \
A2 = _mm256_add_epi32( A2, C5 ); \
A1 = _mm256_add_epi32( A1, C4 ); \
A0 = _mm256_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
static void
@@ -649,42 +635,37 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -721,18 +702,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \
{ \
(state)->state_loaded = true; \
A00 = m128_const1_64( 0x20728DFD20728DFD ); \
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m128_const1_64( 0xE782B699E782B699 ); \
A03 = m128_const1_64( 0x5530463255304632 ); \
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m128_const1_64( 0x8BD144108BD14410 ); \
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m128_const1_64( 0xE782B699E782B699 ); \
A3 = m128_const1_64( 0x5530463255304632 ); \
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
@@ -771,18 +752,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -881,19 +862,10 @@ do { \
#define XOR_W \
do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
@@ -914,83 +886,71 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P \
@@ -1014,47 +974,47 @@ do { \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \
A03 = _mm_add_epi32( A03, CE ); \
A02 = _mm_add_epi32( A02, CD ); \
A01 = _mm_add_epi32( A01, CC ); \
A00 = _mm_add_epi32( A00, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \
A09 = _mm_add_epi32( A09, CC ); \
A08 = _mm_add_epi32( A08, CB ); \
A07 = _mm_add_epi32( A07, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \
AB = _mm_add_epi32( AB, C6 ); \
AA = _mm_add_epi32( AA, C5 ); \
A9 = _mm_add_epi32( A9, C4 ); \
A8 = _mm_add_epi32( A8, C3 ); \
A7 = _mm_add_epi32( A7, C2 ); \
A6 = _mm_add_epi32( A6, C1 ); \
A5 = _mm_add_epi32( A5, C0 ); \
A4 = _mm_add_epi32( A4, CF ); \
A3 = _mm_add_epi32( A3, CE ); \
A2 = _mm_add_epi32( A2, CD ); \
A1 = _mm_add_epi32( A1, CC ); \
A0 = _mm_add_epi32( A0, CB ); \
AB = _mm_add_epi32( AB, CA ); \
AA = _mm_add_epi32( AA, C9 ); \
A9 = _mm_add_epi32( A9, C8 ); \
A8 = _mm_add_epi32( A8, C7 ); \
A7 = _mm_add_epi32( A7, C6 ); \
A6 = _mm_add_epi32( A6, C5 ); \
A5 = _mm_add_epi32( A5, C4 ); \
A4 = _mm_add_epi32( A4, C3 ); \
A3 = _mm_add_epi32( A3, C2 ); \
A2 = _mm_add_epi32( A2, C1 ); \
A1 = _mm_add_epi32( A1, C0 ); \
A0 = _mm_add_epi32( A0, CF ); \
AB = _mm_add_epi32( AB, CE ); \
AA = _mm_add_epi32( AA, CD ); \
A9 = _mm_add_epi32( A9, CC ); \
A8 = _mm_add_epi32( A8, CB ); \
A7 = _mm_add_epi32( A7, CA ); \
A6 = _mm_add_epi32( A6, C9 ); \
A5 = _mm_add_epi32( A5, C8 ); \
A4 = _mm_add_epi32( A4, C7 ); \
A3 = _mm_add_epi32( A3, C6 ); \
A2 = _mm_add_epi32( A2, C5 ); \
A1 = _mm_add_epi32( A1, C4 ); \
A0 = _mm_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
/*

View File

@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

View File

@@ -18,10 +18,13 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
/*
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror128_32( a ), \
mm256_ror128_32( b ), 0x88 )
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
mm256_shuflr128_32( b ), 0x88 )
*/
//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )
#if defined(__VAES__)
@@ -78,7 +81,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
@@ -88,7 +91,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -97,25 +100,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -127,55 +130,55 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 2, 6, 10
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p2 = _mm256_xor_si256( p2, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p0 = _mm256_xor_si256( p0, x );
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror128_32(
k00 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror128_32(
k01 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror128_32(
k02 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror128_32(
k03 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror128_32(
k10 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror128_32(
k11 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror128_32(
k12 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror128_32(
k13 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 4, 8, 12
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p0 = _mm256_xor_si256( p0, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p2 = _mm256_xor_si256( p2, x );
@@ -209,35 +212,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror128_32(
k00 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror128_32(
k01 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror128_32(
k02 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror128_32(
k03 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror128_32(
k10 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror128_32(
k11 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror128_32(
k13 = _mm256_xor_si256( mm256_shuflr128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

View File

@@ -11,10 +11,6 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#define mm512_ror2x512hi_1x32( a, b ) \
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
mm512_ror128_32( b ) )
static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
{
@@ -60,7 +56,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
{
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 )
@@ -69,33 +65,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1,
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K2,
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( K3,
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( K4,
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K5,
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K6,
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
@@ -106,55 +102,55 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_ror128_32(
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( mm512_ror128_32(
K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( mm512_ror128_32(
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
@@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
@@ -187,34 +183,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 13
K0 = _mm512_xor_si512( mm512_ror128_32(
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7= _mm512_xor_si512( mm512_ror128_32(
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

View File

@@ -59,30 +59,6 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
// and return the rotated 128 bit vector a.
// a[3:0] = { b[0], a[3], a[2], a[1] }
#if defined(__SSSE3__)
#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )
#else // SSE2
#define mm128_ror256hi_1x32( a, b ) \
_mm_or_si128( _mm_srli_si128( a, 4 ), \
_mm_slli_si128( b, 12 ) )
#endif
#if defined(__AVX2__)
// 2 way version of above
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
mm256_rol256_3x32( b ), 0x88 )
#endif
static void
c512( sph_shavite_big_context *sc, const void *msg )
@@ -135,7 +111,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -144,7 +120,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -153,31 +129,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -222,38 +198,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 3, 7, 11
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -295,39 +271,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 13
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -35,7 +35,7 @@
#include "sph_shavite.h"
#if !defined(__AES__)
#if !(defined(__AES__) && defined(__SSSE3__))
#ifdef __cplusplus
extern "C"{

View File

@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
//Don't call these directly from application code, use the macros below.
#ifdef __AES__
#if defined(__AES__) && defined(__SSSE3__)
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);

View File

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
@@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define DO_REDUCE_FULL_S4w(i) \
do { \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
} while(0)
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
// targetted
#define BUTTERFLY_0( i,j ) \
do { \
@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the column.
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -534,10 +535,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)
@@ -558,15 +559,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help for interleaving in the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function in two halfes
// so as to insert some independent computations in between
// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
// targetted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
#if defined (SKEIN_8WAY)
@@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input )
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
sph_sha256_context ctx_sha256;
#else
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx_sha256;
@@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input )
#if defined(__SHA__)
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash0, 64 );
sph_sha256_close( &ctx_sha256, hash0 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash1, 64 );
sph_sha256_close( &ctx_sha256, hash1 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash2, 64 );
sph_sha256_close( &ctx_sha256, hash2 );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash3, 64 );
sph_sha256_close( &ctx_sha256, hash3 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
sha256_full( hash2, hash2, 64 );
sha256_full( hash3, hash3, 64 );
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else

View File

@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
}
// Do not use with 128 bit data
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
{

View File

@@ -5,21 +5,18 @@
#include <string.h>
#include <stdint.h>
#include "sph_skein.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha256-hash.h"
void skeinhash(void *state, const void *input)
{
uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
sph_sha256_context ctx_sha256;
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
sha256_full( hash, hash, 64 );
memcpy(state, hash, 32);
}
@@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input)
int scanhash_skein( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
@@ -36,7 +33,7 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);

Some files were not shown because too many files have changed in this diff Show More