mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
7d2ef7973d | ||
![]() |
e6fd9b1d69 | ||
![]() |
1a234cbe53 | ||
![]() |
47cc5dcff5 | ||
![]() |
2cd1507c2e | ||
![]() |
9b905fccc8 |
@@ -32,14 +32,26 @@ but different package names.
|
||||
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||
support depending on your CPU and compiler version:
|
||||
openssl 1.1.0e or higher.
|
||||
|
||||
"-march=native" is always the best choice
|
||||
znver1 and znver2 should be recognized on most recent version of GCC and
|
||||
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
|
||||
In the meantime here are some suggestions to compile with new CPUs:
|
||||
|
||||
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||
"-march=native" is usually the best choice, used by build.sh.
|
||||
|
||||
"-msha" Add SHA to other tuning options
|
||||
"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized.
|
||||
|
||||
"-mcascadelake -msha" or
|
||||
"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
|
||||
|
||||
Features can also be added individually:
|
||||
|
||||
"-msha" adds support for HW accelerated sha256.
|
||||
|
||||
"-mavx512" adds support for 512 bit vectors
|
||||
|
||||
"-mvaes" add support for parallel AES
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
|
@@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib
|
||||
version available in the repositories.
|
||||
|
||||
Download the following source code packages from their respective and
|
||||
respected download locations, copy them to ~/usr/lib/ and uncompress them.
|
||||
respected download locations, copy them to $HOME/usr/lib/ and uncompress them.
|
||||
|
||||
openssl: https://github.com/openssl/openssl/releases
|
||||
|
||||
@@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct
|
||||
|
||||
Run cpuminer
|
||||
|
||||
In a command windows change directories to the unzipped release folder. to get a list of all options:
|
||||
In a command windows change directories to the unzipped release folder. To get a list of all options:
|
||||
|
||||
cpuminer.exe --help
|
||||
|
||||
Command options are specific to where you mine. Refer to the pool's instructions on how to set them.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Create a link to the locally compiled version of gmp.h
|
||||
|
||||
$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
|
||||
|
||||
Edit configure.ac to fix lipthread package name.
|
||||
|
||||
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
|
||||
7. Compile
|
||||
|
||||
you can use the default compile if you intend to use cpuminer-opt on the
|
||||
same CPU and the virtual machine supports that architecture.
|
||||
|
||||
./build.sh
|
||||
|
||||
Otherwise you can compile manually while setting options in CFLAGS.
|
||||
|
||||
Some common options:
|
||||
|
||||
To compile for a specific CPU architecture:
|
||||
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for AMD Ryzen.
|
||||
|
||||
You can compile more generically for a set of specific CPU features
|
||||
if you know what features you want:
|
||||
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
|
||||
This will compile for an older CPU that does not have AVX.
|
||||
|
||||
You can find several examples in build-allarch.sh
|
||||
|
||||
If you have a CPU with more than 64 threads and Windows 7 or higher you
|
||||
can enable the CPU Groups feature:
|
||||
|
||||
-D_WIN32_WINNT==0x0601
|
||||
|
||||
Once you have run configure successfully run make with n CPU threads:
|
||||
|
||||
make -j n
|
||||
|
||||
Copy cpuminer.exe to the release directory, compress and copy the release
|
||||
directory to a Windows system and run cpuminer.exe from the command line.
|
||||
|
||||
Run cpuminer
|
||||
|
||||
In a command windows change directories to the unzipped release folder.
|
||||
to get a list of all options:
|
||||
|
||||
cpuminer.exe --help
|
||||
|
||||
Command options are specific to where you mine. Refer to the pool's
|
||||
instructions on how to set them.
|
||||
|
@@ -158,7 +158,9 @@ cpuminer_SOURCES = \
|
||||
algo/ripemd/lbry.c \
|
||||
algo/ripemd/lbry-4way.c \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/scrypt-core-4way.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/sha/sha256-hash.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
@@ -167,7 +169,9 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256d-4way.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
|
48
README.txt
48
README.txt
@@ -18,14 +18,14 @@ error to find the fastest one that works. Pay attention to
|
||||
the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing some
|
||||
features.
|
||||
Architecture names and compile options used are only provided for
|
||||
mainstream desktop CPUs. Budget CPUs like Pentium and Celeron are often
|
||||
missing some features. Check your CPU.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
Support for AMD CPUs older than Ryzen is incomplete and without specific
|
||||
recommendations. Find the best fit. CPUs older than Piledriver, including
|
||||
Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an
|
||||
incompatible implementation of SSE2 on these CPUs.
|
||||
|
||||
More information for Intel and AMD CPU architectures and their features
|
||||
can be found on Wikipedia.
|
||||
@@ -34,26 +34,21 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
File name Architecture name
|
||||
|
||||
Exe file name Compile flags Arch name
|
||||
cpuminer-sse2.exe Core2, Nehalem, generic x86_64 with SSE2
|
||||
cpuminer-aes-sse42.exe Westmere
|
||||
cpuminer-avx.exe Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
cpuminer-avx2-sha.exe AMD Zen1, Zen2
|
||||
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
|
||||
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
|
||||
cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake
|
||||
cpuminer-avx512-sha.exe "-march=cascadelake -msha" Rocketlake(2)
|
||||
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake, Tigerlake(3)
|
||||
cpuminer-zen.exe "-march=znver1" AMD Zen1, Zen2
|
||||
cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(4)
|
||||
|
||||
(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
|
||||
(2) Rocketlake build uses cascadelake+sha as a workaround until Rocketlake
|
||||
compiler support is avalable.
|
||||
(3) Icelake & Tigerlake are only available on some laptops. Mining with a
|
||||
laptop is not recommended.
|
||||
(4) Zen3 build uses zen2+vaes as a workaround until Zen3 compiler support is
|
||||
available. Zen2 CPUs should use Zen1 build.
|
||||
* Alderlake is a hybrid architecture. With the E-cores disabled it may be
|
||||
possible to enable AVX512 on the the P-cores and use the avx512-sha-vaes
|
||||
build. This is not officially supported by Intel at time of writing.
|
||||
Check for current information.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
@@ -66,8 +61,7 @@ https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Some DLL filess may already be installed on the system by Windows or third
|
||||
party packages. They often will work and may be used instead of the included
|
||||
file. Without a compelling reason to do so it's recommended to use the included
|
||||
files as they are packaged.
|
||||
file.
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
|
@@ -65,6 +65,98 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.19.1
|
||||
|
||||
Changes to Windows binaries package:
|
||||
- builds for CPUs with AVX or lower have CPU groups disabled,
|
||||
- zen3 build renamed to avx2-sha-vaes to support Alderlake as well as Zen3,
|
||||
- zen build renamed to avx2-sha, supports Zen1 & Zen2,
|
||||
- avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
|
||||
- see README.txt for compatibility details.
|
||||
|
||||
Fixed a few compiler warnings that are new in GCC 11.
|
||||
Other minor fixes.
|
||||
|
||||
v3.19.0
|
||||
|
||||
Windows binaries now built with support for CPU groups, requires Windows 7.
|
||||
|
||||
Changes to cpu-affinity:
|
||||
- PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups,
|
||||
- added support for CPU affinity for up to 256 threads or CPUs,
|
||||
- streamlined code for more efficient initialization of miner threads,
|
||||
- precise affining of each miner thread to a specific CPU,
|
||||
- added an option to disable CPU affinity with "--cpu-affinity 0"
|
||||
|
||||
Faster sha256t with AVX512 & AVX2.
|
||||
|
||||
Added stratum error count to stats log, reported only when non-zero.
|
||||
|
||||
v3.18.2
|
||||
|
||||
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
|
||||
|
||||
AVX512 for sha256d.
|
||||
|
||||
SSE42 and AVX may now be displayed as mining features at startup.
|
||||
This is hard coded for each algo, and is only implemented for scrypt
|
||||
at this time as it is the only algo with significant performance differences
|
||||
with those features.
|
||||
|
||||
Fixed an issue where a high hashrate algo could cause excessive invalid hash
|
||||
rate log reports when starting up in benchmark mode.
|
||||
|
||||
v3.18.1
|
||||
|
||||
More speed for scrypt:
|
||||
- additional scryptn2 optimizations for all CPU architectures,
|
||||
- AVX2 is now used by default on CPUS with SHA but not AVX512,
|
||||
- scrypt:1024 performance lost in v3.18.0 is restored,
|
||||
- AVX512 & AVX2 improvements to scrypt:1024.
|
||||
|
||||
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
|
||||
|
||||
Issue #337: fixed a problem that could display negative stats values in the
|
||||
first summary report if the report was forced prematurely due to a stratum
|
||||
diff change. The stats will still be invalid but should display zeros.
|
||||
|
||||
v3.18.0
|
||||
|
||||
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- AVX512 & SHA support for sha256, AVX512 has priority,
|
||||
- up to 50% increase in hashrate,
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
- scrypt, default N=1024 (LTC), will likely perform slower.
|
||||
|
||||
Improved stale share detection and handling for Scrypt with large N factor:
|
||||
- abort and discard partially computed hash when new work is detected,
|
||||
- quicker response to new job, less time wasted mining stale job.
|
||||
|
||||
Improved stale share handling for all algorithms:
|
||||
- report possible stale share when new work received with a previously
|
||||
submitted share still pending,
|
||||
- when new work is detected report the submission of an already completed,
|
||||
otherwise valid, but likely stale, share,
|
||||
- fixed incorrect block height in stale share log.
|
||||
|
||||
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
|
||||
|
||||
When stratum disconnects miner threads go to idle until reconnected.
|
||||
|
||||
Colour changes to some logs.
|
||||
|
||||
Some low level function name changes for clarity and consistency.
|
||||
|
||||
The reference hashrate in the summary log and the benchmark total hashrate
|
||||
are now the mean hashrate for the session.
|
||||
|
||||
v3.17.1
|
||||
|
||||
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
|
||||
More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
|
||||
Fixed my-gr algo for VAES.
|
||||
|
||||
v3.17.0
|
||||
|
||||
AVX512 optimized using ternary logic instructions.
|
||||
|
@@ -1,3 +1,6 @@
|
||||
#ifndef __ALGO_GATE_API_H__
|
||||
#define __ALGO_GATE_API_H__ 1
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
@@ -94,7 +97,6 @@ typedef uint32_t set_t;
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
|
||||
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
|
||||
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
|
||||
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
@@ -319,3 +321,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
#endif
|
||||
|
@@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define ror64(x, n) _mm512_ror_epi64((x), (n))
|
||||
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
|
||||
|
||||
static __m512i muladd(__m512i x, __m512i y)
|
||||
{
|
||||
@@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 32); \
|
||||
D1 = ror64(D1, 32); \
|
||||
D0 = ROR64(D0, 32); \
|
||||
D1 = ROR64(D1, 32); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 24); \
|
||||
B1 = ror64(B1, 24); \
|
||||
B0 = ROR64(B0, 24); \
|
||||
B1 = ROR64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ror64(D0, 16); \
|
||||
D1 = ror64(D1, 16); \
|
||||
D0 = ROR64(D0, 16); \
|
||||
D1 = ROR64(D1, 16); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ror64(B0, 63); \
|
||||
B1 = ror64(B1, 63); \
|
||||
B0 = ROR64(B0, 63); \
|
||||
B1 = ROR64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
|
@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_hash_le80( void *hash, const void *data );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
@@ -669,14 +669,14 @@ do { \
|
||||
ROUND_S_8WAY(2); \
|
||||
ROUND_S_8WAY(3); \
|
||||
} \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
|
||||
H0 = mm256_xor3( V8, V0, H0 ); \
|
||||
H1 = mm256_xor3( V9, V1, H1 ); \
|
||||
H2 = mm256_xor3( VA, V2, H2 ); \
|
||||
H3 = mm256_xor3( VB, V3, H3 ); \
|
||||
H4 = mm256_xor3( VC, V4, H4 ); \
|
||||
H5 = mm256_xor3( VD, V5, H5 ); \
|
||||
H6 = mm256_xor3( VE, V6, H6 ); \
|
||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -808,14 +808,14 @@ do { \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
} \
|
||||
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
|
||||
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
|
||||
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
|
||||
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
|
||||
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
|
||||
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
|
||||
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
|
||||
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
|
||||
H0 = mm512_xor3( V8, V0, H0 ); \
|
||||
H1 = mm512_xor3( V9, V1, H1 ); \
|
||||
H2 = mm512_xor3( VA, V2, H2 ); \
|
||||
H3 = mm512_xor3( VB, V3, H3 ); \
|
||||
H4 = mm512_xor3( VC, V4, H4 ); \
|
||||
H5 = mm512_xor3( VD, V5, H5 ); \
|
||||
H6 = mm512_xor3( VE, V6, H6 ); \
|
||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
|
||||
ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
|
||||
ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
|
||||
ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
|
||||
ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
|
||||
ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
|
||||
ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
|
||||
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
@@ -368,7 +368,7 @@ do { \
|
||||
ROUND8W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G8W
|
||||
#undef ROUND8W
|
||||
@@ -566,7 +566,7 @@ do { \
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
|
@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY( buf ) do \
|
||||
@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
|
||||
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
|
||||
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
|
||||
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
|
||||
V8 = m512_const1_64( CB0 ); \
|
||||
V9 = m512_const1_64( CB1 ); \
|
||||
VA = m512_const1_64( CB2 ); \
|
||||
VB = m512_const1_64( CB3 ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB4 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
H0 = mm512_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm512_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm512_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm512_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm512_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm512_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm512_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = mm512_xor3( V8, V0, H0 ); \
|
||||
H1 = mm512_xor3( V9, V1, H1 ); \
|
||||
H2 = mm512_xor3( VA, V2, H2 ); \
|
||||
H3 = mm512_xor3( VB, V3, H3 ); \
|
||||
H4 = mm512_xor3( VC, V4, H4 ); \
|
||||
H5 = mm512_xor3( VD, V5, H5 ); \
|
||||
H6 = mm512_xor3( VE, V6, H6 ); \
|
||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
|
||||
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
|
||||
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
|
||||
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
|
||||
V8 = m512_const1_64( CB0 );
|
||||
V9 = m512_const1_64( CB1 );
|
||||
VA = m512_const1_64( CB2 );
|
||||
VB = m512_const1_64( CB3 );
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
m512_const1_64( CB4 ) );
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
ROUND_B_8WAY(4);
|
||||
ROUND_B_8WAY(5);
|
||||
|
||||
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
|
||||
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
|
||||
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
|
||||
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
|
||||
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
|
||||
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
|
||||
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
|
||||
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = zero;
|
||||
casti_m512i( sc->S, 1 ) = zero;
|
||||
casti_m512i( sc->S, 2 ) = zero;
|
||||
casti_m512i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = m512_zero;
|
||||
casti_m512i( sc->S, 1 ) = m512_zero;
|
||||
casti_m512i( sc->S, 2 ) = m512_zero;
|
||||
casti_m512i( sc->S, 3 ) = m512_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
|
||||
V8 = m256_const1_64( CB0 ); \
|
||||
V9 = m256_const1_64( CB1 ); \
|
||||
VA = m256_const1_64( CB2 ); \
|
||||
VB = m256_const1_64( CB3 ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
m256_const1_64( CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = mm256_xor3( V8, V0, H0 ); \
|
||||
H1 = mm256_xor3( V9, V1, H1 ); \
|
||||
H2 = mm256_xor3( VA, V2, H2 ); \
|
||||
H3 = mm256_xor3( VB, V3, H3 ); \
|
||||
H4 = mm256_xor3( VC, V4, H4 ); \
|
||||
H5 = mm256_xor3( VD, V5, H5 ); \
|
||||
H6 = mm256_xor3( VE, V6, H6 ); \
|
||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
|
||||
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
|
||||
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
|
||||
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
|
||||
V8 = m256_const1_64( CB0 );
|
||||
V9 = m256_const1_64( CB1 );
|
||||
VA = m256_const1_64( CB2 );
|
||||
VB = m256_const1_64( CB3 );
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
m256_const1_64( CB4 ) );
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
ROUND_B_4WAY(4);
|
||||
ROUND_B_4WAY(5);
|
||||
|
||||
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
|
||||
sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
|
||||
sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
|
||||
sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
|
||||
sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
|
||||
sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
|
||||
sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
|
||||
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = m256_zero;
|
||||
casti_m256i( sc->S, 1 ) = m256_zero;
|
||||
casti_m256i( sc->S, 2 ) = m256_zero;
|
||||
casti_m256i( sc->S, 3 ) = m256_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
|
@@ -8,7 +8,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
|
||||
return &work_data[ DECRED_NONCE_INDEX ];
|
||||
}
|
||||
|
||||
double decred_calc_network_diff( struct work* work )
|
||||
long double decred_calc_network_diff( struct work* work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
@@ -16,7 +16,7 @@ double decred_calc_network_diff( struct work* work )
|
||||
uint32_t bits = ( nbits & 0xffffff );
|
||||
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
|
||||
int m;
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
long double d = (long double)0x0000ffff / (long double)bits;
|
||||
|
||||
for ( m = shift; m < 29; m++ )
|
||||
d *= 256.0;
|
||||
@@ -25,7 +25,7 @@ double decred_calc_network_diff( struct work* work )
|
||||
if ( shift == 28 )
|
||||
d *= 256.0; // testnet
|
||||
if ( opt_debug_diff )
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
|
||||
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
|
||||
shift, bits );
|
||||
return net_diff;
|
||||
}
|
||||
|
@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
qt[30] = expand2s8( qt, M, H, 30 );
|
||||
qt[31] = expand2s8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm256_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm256_xor_si256( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
|
||||
mm256_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm256_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
/*
|
||||
dH[ 0] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
|
||||
_mm256_srli_epi32( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
|
||||
_mm256_slli_epi32( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
|
||||
_mm256_slli_epi32( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
|
||||
_mm256_slli_epi32( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
|
||||
_mm256_slli_epi32( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
|
||||
_mm256_srli_epi32( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
|
||||
_mm256_slli_epi32( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
|
||||
_mm256_slli_epi32( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
*/
|
||||
}
|
||||
|
||||
static const __m256i final_s8[16] =
|
||||
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
|
||||
qt[30] = expand2s16( qt, M, H, 30 );
|
||||
qt[31] = expand2s16( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
|
@@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
#define rb6(x) mm256_rol_64( x, 43 )
|
||||
#define rb7(x) mm256_rol_64( x, 53 )
|
||||
|
||||
#define rol_off_64( M, j, off ) \
|
||||
mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
#define rol_off_64( M, j ) \
|
||||
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b( M, H, j ) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
|
||||
rol_off_64( M, j, 3 ) ), \
|
||||
rol_off_64( M, j, 10 ) ), \
|
||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
|
||||
#define expand1b( qt, M, H, i ) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand1_b( qt, i ) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
|
||||
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
|
||||
@@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
|
||||
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
|
||||
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b( qt, M, H, i) \
|
||||
_mm256_add_epi64( mm256_add4_64( \
|
||||
#define expand2_b( qt, i) \
|
||||
mm256_add4_64( \
|
||||
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
|
||||
@@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
|
||||
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm256_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm256_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm256_sub_epi64( mh[10], \
|
||||
mh[13] ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm256_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm256_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm256_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm256_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm256_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm256_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
||||
_mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm256_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
||||
_mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm256_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
_mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm256_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
__m256i qt[32], xl, xh;
|
||||
__m256i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm256_xor_si256( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
|
||||
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
|
||||
@@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
|
||||
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
|
||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||
qt[16] = expand1b( qt, M, H, 16 );
|
||||
qt[17] = expand1b( qt, M, H, 17 );
|
||||
qt[18] = expand2b( qt, M, H, 18 );
|
||||
qt[19] = expand2b( qt, M, H, 19 );
|
||||
qt[20] = expand2b( qt, M, H, 20 );
|
||||
qt[21] = expand2b( qt, M, H, 21 );
|
||||
qt[22] = expand2b( qt, M, H, 22 );
|
||||
qt[23] = expand2b( qt, M, H, 23 );
|
||||
qt[24] = expand2b( qt, M, H, 24 );
|
||||
qt[25] = expand2b( qt, M, H, 25 );
|
||||
qt[26] = expand2b( qt, M, H, 26 );
|
||||
qt[27] = expand2b( qt, M, H, 27 );
|
||||
qt[28] = expand2b( qt, M, H, 28 );
|
||||
qt[29] = expand2b( qt, M, H, 29 );
|
||||
qt[30] = expand2b( qt, M, H, 30 );
|
||||
qt[31] = expand2b( qt, M, H, 31 );
|
||||
|
||||
__m256i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol_off_64( M, i );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
|
||||
|
||||
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||
qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) );
|
||||
qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) );
|
||||
qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) );
|
||||
qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) );
|
||||
qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) );
|
||||
qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) );
|
||||
qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) );
|
||||
qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) );
|
||||
qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) );
|
||||
qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) );
|
||||
qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) );
|
||||
qt[29] = _mm256_add_epi64( qt[29], expand2_b( qt, 29 ) );
|
||||
qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) );
|
||||
qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
@@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
@@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||
|
||||
#define rol8w_off_64( M, j, off ) \
|
||||
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
#define rol8w_off_64( M, j ) \
|
||||
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b8( M, H, j ) \
|
||||
_mm512_xor_si512( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
|
||||
rol8w_off_64( M, j, 3 ) ), \
|
||||
rol8w_off_64( M, j, 10 ) ), \
|
||||
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define expand1b8( qt, M, H, i ) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand1_b8( qt, i ) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
||||
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
||||
@@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
||||
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2b8( qt, M, H, i) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
#define expand2_b8( qt, i) \
|
||||
mm512_add4_64( \
|
||||
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
||||
@@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define W8b0 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
_mm512_add_epi64( mh[13], mh[14] ) )
|
||||
|
||||
#define W8b1 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
_mm512_sub_epi64( mh[14], mh[15] ) )
|
||||
|
||||
#define W8b2 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b3 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
_mm512_sub_epi64( mh[10], mh[13] ) )
|
||||
|
||||
#define W8b4 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
_mm512_add_epi64( mh[11], mh[14] ) )
|
||||
|
||||
#define W8b5 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
_mm512_sub_epi64( mh[12], mh[15] ) )
|
||||
|
||||
#define W8b6 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
_mm512_sub_epi64( mh[11], mh[13] ) )
|
||||
|
||||
#define W8b7 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
_mm512_add_epi64( mh[12], mh[14] ) )
|
||||
|
||||
#define W8b8 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[13], mh[15] ) )
|
||||
|
||||
#define W8b9 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define W8b10 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
_mm512_sub_epi64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define W8b11 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
_mm512_sub_epi64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define W8b12 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define W8b13 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
||||
_mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
_mm512_add_epi64( mh[10], mh[11] ) )
|
||||
|
||||
#define W8b14 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
||||
_mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
_mm512_add_epi64( mh[11], mh[12] ) )
|
||||
|
||||
#define W8b15 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
_mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
_mm512_sub_epi64( mh[ 9], mh[13] ) )
|
||||
|
||||
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
__m512i mh[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = _mm512_xor_si512( M[i], H[i] );
|
||||
|
||||
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
||||
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
||||
@@ -1268,29 +1169,68 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
qt[16] = expand1b8( qt, M, H, 16 );
|
||||
qt[17] = expand1b8( qt, M, H, 17 );
|
||||
qt[18] = expand2b8( qt, M, H, 18 );
|
||||
qt[19] = expand2b8( qt, M, H, 19 );
|
||||
qt[20] = expand2b8( qt, M, H, 20 );
|
||||
qt[21] = expand2b8( qt, M, H, 21 );
|
||||
qt[22] = expand2b8( qt, M, H, 22 );
|
||||
qt[23] = expand2b8( qt, M, H, 23 );
|
||||
qt[24] = expand2b8( qt, M, H, 24 );
|
||||
qt[25] = expand2b8( qt, M, H, 25 );
|
||||
qt[26] = expand2b8( qt, M, H, 26 );
|
||||
qt[27] = expand2b8( qt, M, H, 27 );
|
||||
qt[28] = expand2b8( qt, M, H, 28 );
|
||||
qt[29] = expand2b8( qt, M, H, 29 );
|
||||
qt[30] = expand2b8( qt, M, H, 30 );
|
||||
qt[31] = expand2b8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
__m512i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol8w_off_64( M, i );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) );
|
||||
qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) );
|
||||
qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) );
|
||||
qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) );
|
||||
qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) );
|
||||
qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) );
|
||||
qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) );
|
||||
qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) );
|
||||
qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) );
|
||||
qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) );
|
||||
qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) );
|
||||
qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) );
|
||||
qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) );
|
||||
qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) );
|
||||
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
|
||||
|
@@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp )
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
// 8 ways, 4 way parallel double buffered
|
||||
static void transform_4way_2buf( cube_4way_2buf_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
__m512i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
__m512i tx0, tx1, ty0, ty1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h0 );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h0 + 7 );
|
||||
|
||||
y0 = _mm512_load_si512( (__m512i*)sp->h1 );
|
||||
y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 );
|
||||
y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 );
|
||||
y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 );
|
||||
y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 );
|
||||
y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 );
|
||||
y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 );
|
||||
y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 );
|
||||
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
tx0 = x0;
|
||||
ty0 = y0;
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
tx1 = x1;
|
||||
ty1 = y1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
y0 = mm512_rol_32( y2, 7 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
y1 = mm512_rol_32( y3, 7 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
|
||||
|
||||
x2 = mm512_rol_32( tx0, 7 );
|
||||
y2 = mm512_rol_32( ty0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
y0 = _mm512_xor_si512( y0, y4 );
|
||||
x4 = mm512_swap128_64( x4 );
|
||||
x3 = mm512_rol_32( tx1, 7 );
|
||||
y3 = mm512_rol_32( ty1, 7 );
|
||||
y4 = mm512_swap128_64( y4 );
|
||||
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
y1 = _mm512_xor_si512( y1, y5 );
|
||||
x5 = mm512_swap128_64( x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
y2 = _mm512_xor_si512( y2, y6 );
|
||||
y5 = mm512_swap128_64( y5 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
|
||||
x6 = mm512_swap128_64( x6 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
y4 = _mm512_add_epi32( y0, y4 );
|
||||
y6 = mm512_swap128_64( y6 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
y5 = _mm512_add_epi32( y1, y5 );
|
||||
x7 = mm512_swap128_64( x7 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
y6 = _mm512_add_epi32( y2, y6 );
|
||||
tx0 = x0;
|
||||
ty0 = y0;
|
||||
y7 = mm512_swap128_64( y7 );
|
||||
tx1 = x2;
|
||||
ty1 = y2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
y0 = mm512_rol_32( y1, 11 );
|
||||
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y7 = _mm512_add_epi32( y3, y7 );
|
||||
|
||||
x1 = mm512_rol_32( tx0, 11 );
|
||||
y1 = mm512_rol_32( ty0, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x4 = mm512_swap64_32( x4 );
|
||||
y0 = _mm512_xor_si512( y0, y4 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
y4 = mm512_swap64_32( y4 );
|
||||
y2 = mm512_rol_32( y3, 11 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x5 = mm512_swap64_32( x5 );
|
||||
y1 = _mm512_xor_si512( y1, y5 );
|
||||
x3 = mm512_rol_32( tx1, 11 );
|
||||
y5 = mm512_swap64_32( y5 );
|
||||
y3 = mm512_rol_32( ty1, 11 );
|
||||
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x6 = mm512_swap64_32( x6 );
|
||||
y2 = _mm512_xor_si512( y2, y6 );
|
||||
y6 = mm512_swap64_32( y6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x7 = mm512_swap64_32( x7 );
|
||||
y3 = _mm512_xor_si512( y3, y7 );
|
||||
|
||||
y7 = mm512_swap64_32( y7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h0, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h0 + 7, x7 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h1, y0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 1, y1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 2, y2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 3, y3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 4, y4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 5, y5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 6, y6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h1 + 7, y7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
@@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||
void *output0, void *output1, int hashbitlen,
|
||||
const void *data0, const void *data1, size_t size )
|
||||
{
|
||||
__m512i *h0 = (__m512i*)sp->h0;
|
||||
__m512i *h1 = (__m512i*)sp->h1;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h1[0] = h0[0] = m512_const1_128( iv[0] );
|
||||
h1[1] = h0[1] = m512_const1_128( iv[1] );
|
||||
h1[2] = h0[2] = m512_const1_128( iv[2] );
|
||||
h1[3] = h0[3] = m512_const1_128( iv[3] );
|
||||
h1[4] = h0[4] = m512_const1_128( iv[4] );
|
||||
h1[5] = h0[5] = m512_const1_128( iv[5] );
|
||||
h1[6] = h0[6] = m512_const1_128( iv[6] );
|
||||
h1[7] = h0[7] = m512_const1_128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m512i *in0 = (__m512i*)data0;
|
||||
const __m512i *in1 = (__m512i*)data1;
|
||||
__m512i *hash0 = (__m512i*)output0;
|
||||
__m512i *hash1 = (__m512i*)output1;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] );
|
||||
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way_2buf( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
__m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
|
||||
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
|
||||
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );
|
||||
|
||||
transform_4way_2buf( sp );
|
||||
|
||||
tmp = m512_const2_64( 0x0000000100000000, 0 );
|
||||
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
|
||||
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way_2buf( sp );
|
||||
|
||||
memcpy( hash0, sp->h0, sp->hashlen<<6);
|
||||
memcpy( hash1, sp->h1, sp->hashlen<<6);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
@@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
// 2 way 128
|
||||
|
||||
// This isn't expected to be used with AVX512 so HW rotate intruction
|
||||
// is assumed not avaiable.
|
||||
// Use double buffering to optimize serial bit rotations. Full double
|
||||
// buffering isn't practical because it needs twice as many registers
|
||||
// with AVX2 having only half as many as AVX512.
|
||||
#define ROL2( out0, out1, in0, in1, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_slli_epi32( in0, c ); \
|
||||
__m256i t1 = _mm256_slli_epi32( in1, c ); \
|
||||
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
|
||||
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
|
||||
out0 = _mm256_or_si256( out0, t0 ); \
|
||||
out1 = _mm256_or_si256( out1, t1 ); \
|
||||
}
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
ROL2( x0, x1, x2, x3, 7 );
|
||||
ROL2( x2, x3, y0, y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap128_64( x4 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x5 = mm256_swap128_64( x5 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x6 = mm256_swap128_64( x6 );
|
||||
y0 = x0;
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x7 = mm256_swap128_64( x7 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
y1 = x2;
|
||||
ROL2( x0, x1, x1, y0, 11 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
ROL2( x2, x3, x3, y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x4 = mm256_swap64_32( x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x5 = mm256_swap64_32( x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x6 = mm256_swap64_32( x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x7 = mm256_swap64_32( x7 );
|
||||
}
|
||||
|
||||
|
@@ -17,41 +17,41 @@ struct _cube_4way_context
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
struct _cube_4way_2buf_context
|
||||
{
|
||||
__m512i h0[8];
|
||||
__m512i h1[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
typedef struct _cube_4way_2buf_context cube_4way_2buf_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
int blockbytes );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
|
||||
#define cube512_4way_update cube_4way_update
|
||||
#define cube512_4way_update_close cube_4way_update
|
||||
#define cube512_4way_close cube_4way_update
|
||||
#define cube512_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 512, data, size )
|
||||
#define cube512_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 512, data, size )
|
||||
|
||||
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
|
||||
#define cube256_4way_update cube_4way_update
|
||||
#define cube256_4way_update_close cube_4way_update
|
||||
#define cube256_4way_close cube_4way_update
|
||||
#define cube256_4way_full( sp, output, data, size ) \
|
||||
cube_4way_full( sp, output, 256, data, size )
|
||||
#define cube256_4x256_full( sp, output, data, size ) \
|
||||
cube_4x256_full( sp, output, 256, data, size )
|
||||
int cube_4way_2buf_full( cube_4way_2buf_context *sp,
|
||||
void *output0, void *output1, int hashbitlen,
|
||||
const void *data0, const void *data1, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
// 2x128, 2 way parallel AVX2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
|
@@ -31,10 +31,14 @@ static void transform( cubehashParam *sp )
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
|
||||
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
|
||||
x0 = _mm512_xor_si512( mm512_rol_32(
|
||||
mm512_swap256_128( x0 ), 11 ), x1 );
|
||||
x0 = mm512_swap_256( x0 );
|
||||
x0 = mm512_rol_32( x0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap128_64( x1 );
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = mm512_swap256_128( x0 );
|
||||
x0 = mm512_rol_32( x0, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap64_32( x1 );
|
||||
}
|
||||
|
||||
|
@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
|
||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
||||
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
||||
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
||||
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
|
@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
//#define mul2mask m512_const2_64( 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
//#define lsbmask m512_const1_32( 0x01010101 )
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
|
||||
|
||||
#define ECHO_SUBBYTES( state, i, j ) \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||
@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
|
||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES4_2WAY( state, j ) \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
|
||||
|
||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES4_2WAY(_state, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||
@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE_2WAY(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
|
@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s7 = _mm_xor_si128(s7, t1)
|
||||
|
||||
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
|
||||
/*
|
||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||
s1 = x;\
|
||||
s2 = _mm_add_epi8(x, x);\
|
||||
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
*/
|
||||
|
||||
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
|
||||
#define SUBSTITUTE(r0, _t2 )\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
|
||||
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
|
||||
/*
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
||||
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
||||
|
||||
*/
|
||||
|
||||
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t4 = t1;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t4;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = _mm_xor_si128(t2, t3);\
|
||||
t2 = _mm_xor_si128(t2, t0);\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = t0;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
||||
t4 = _mm_xor_si128(t4, t0)
|
||||
|
||||
|
||||
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
UNPACK_S0(r3c, r3a, _t3)
|
||||
|
||||
|
||||
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
||||
r4c = _mm_xor_si128(r4c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r4d = _mm_xor_si128(r4d, _t0);\
|
||||
UNPACK_S0(r3c, r3a, _t3);\
|
||||
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r4c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
||||
UNPACK_S0(r4c, r4a, _t3)
|
||||
|
||||
|
||||
|
||||
#define LOADCOLUMN(x, s, a)\
|
||||
block[0] = col[(base + a + 0) % s];\
|
||||
block[1] = col[(base + a + 1) % s];\
|
||||
@@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
case 1:
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
|
||||
ctx->state[5], ctx->state[ 6], ctx->state[8],
|
||||
ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
|
||||
ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
|
||||
ctx->state[1], ctx->state[7], ctx->state[8],
|
||||
ctx->state[6], ctx->state[0], ctx->state[6],
|
||||
ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6], ctx->state[4],
|
||||
ctx->state[10] );
|
||||
ctx->state[6], ctx->state[0], ctx->state[6],
|
||||
ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6], ctx->state[4],
|
||||
ctx->state[10] );
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
@@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
case 2:
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
|
||||
ctx->state[ 1], ctx->state[2], ctx->state[4],
|
||||
ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
|
||||
ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
|
||||
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
|
||||
ctx->state[9], ctx->state[3], ctx->state[4],
|
||||
ctx->state[2], ctx->state[8], ctx->state[2],
|
||||
ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0],
|
||||
ctx->state[6]);
|
||||
ctx->state[2], ctx->state[8], ctx->state[2],
|
||||
ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0],
|
||||
ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
while( uBlockCount > 0 )
|
||||
{
|
||||
TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
|
||||
ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
|
||||
ctx->state[5], ctx->state[11], ctx->state[0],
|
||||
ctx->state[10], ctx->state[4], ctx->state[10],
|
||||
ctx->state[11], ctx->state[9], ctx->state[3],
|
||||
ctx->state[9], ctx->state[10], ctx->state[8],
|
||||
ctx->state[2] );
|
||||
TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
|
||||
ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
|
||||
ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
|
||||
ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
|
||||
ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
|
||||
ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
|
||||
ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
|
||||
ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]);
|
||||
SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
|
||||
ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
|
||||
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
|
||||
ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
|
||||
ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
|
||||
_t0, _t1, _t2);
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
|
||||
ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
|
||||
ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
|
||||
TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
|
||||
ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
|
||||
_t0, _t1, _t2);
|
||||
SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
|
||||
ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
|
||||
ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
|
||||
ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
@@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
|
||||
void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
{
|
||||
unsigned int block[4] __attribute__ ((aligned (32)));
|
||||
unsigned int col[36] __attribute__ ((aligned (16)));
|
||||
unsigned int block[4] __attribute__ ((aligned (32)));
|
||||
unsigned int col[36] __attribute__ ((aligned (16)));
|
||||
unsigned int i, base;
|
||||
__m128i r0, _t0, _t1, _t2, _t3;
|
||||
|
||||
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
|
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
|
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
|
@@ -43,7 +43,8 @@
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -63,7 +63,8 @@ typedef crypto_uint64 u64;
|
||||
//#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
#define ROTL64(a,n) rol64( a, n )
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
|
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
a4 = _mm512_xor_si512( a4, a5 );\
|
||||
b4 = a6; \
|
||||
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||
b5 = a7; \
|
||||
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0; \
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2( a0, b0, b1 ); \
|
||||
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
|
||||
#if 0
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
|
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
a4 = _mm512_xor_si512( a4, a5 );\
|
||||
b4 = a6; \
|
||||
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||
b5 = a7; \
|
||||
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
b0 = a0; \
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
MUL2( a0, b0, b1 ); \
|
||||
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2_2WAY(i, j, k){\
|
||||
j = _mm256_xor_si256(j, j);\
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
j = _mm256_cmpgt_epi8( m256_zero, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
i = mm256_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
|
@@ -11,7 +11,7 @@
|
||||
#else
|
||||
#include "sph_groestl.h"
|
||||
#endif
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
typedef struct {
|
||||
#ifdef __AES__
|
||||
@@ -19,7 +19,6 @@ typedef struct {
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
sph_sha256_context sha;
|
||||
} myrgr_ctx_holder;
|
||||
|
||||
myrgr_ctx_holder myrgr_ctx;
|
||||
@@ -31,7 +30,6 @@ void init_myrgr_ctx()
|
||||
#else
|
||||
sph_groestl512_init( &myrgr_ctx.groestl );
|
||||
#endif
|
||||
sph_sha256_init( &myrgr_ctx.sha );
|
||||
}
|
||||
|
||||
void myriad_hash(void *output, const void *input)
|
||||
@@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input)
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
|
||||
sph_sha256( &ctx.sha, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
||||
|
||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
#else
|
||||
|
||||
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
hash4, hash5, hash6, hash7, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, 512 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
#endif
|
||||
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
||||
sha256_8way_close( &ctx.sha, output );
|
||||
}
|
||||
|
@@ -632,26 +632,25 @@ do { \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define ROUND_BIG8(rc, alpha) \
|
||||
#define ROUND_BIG8( alpha ) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
s0 = _mm512_xor_si512( s0, m512_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
|
||||
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
|
||||
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
|
||||
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
|
||||
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
|
||||
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
|
||||
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
|
||||
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
|
||||
sA = _mm512_xor_si512( sA, alpha[10] ); \
|
||||
sB = _mm512_xor_si512( sB, alpha[11] ); \
|
||||
sC = _mm512_xor_si512( sC, alpha[12] ); \
|
||||
sD = _mm512_xor_si512( sD, alpha[13] ); \
|
||||
sE = _mm512_xor_si512( sE, alpha[14] ); \
|
||||
sF = _mm512_xor_si512( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX8( s0, s4, s8, sC ); \
|
||||
SBOX8( s1, s5, s9, sD ); \
|
||||
@@ -731,28 +730,66 @@ do { \
|
||||
|
||||
#define P_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8(0, alpha_n); \
|
||||
ROUND_BIG8(1, alpha_n); \
|
||||
ROUND_BIG8(2, alpha_n); \
|
||||
ROUND_BIG8(3, alpha_n); \
|
||||
ROUND_BIG8(4, alpha_n); \
|
||||
ROUND_BIG8(5, alpha_n); \
|
||||
__m512i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG8 \
|
||||
do { \
|
||||
ROUND_BIG8( 0, alpha_f); \
|
||||
ROUND_BIG8( 1, alpha_f); \
|
||||
ROUND_BIG8( 2, alpha_f); \
|
||||
ROUND_BIG8( 3, alpha_f); \
|
||||
ROUND_BIG8( 4, alpha_f); \
|
||||
ROUND_BIG8( 5, alpha_f); \
|
||||
ROUND_BIG8( 6, alpha_f); \
|
||||
ROUND_BIG8( 7, alpha_f); \
|
||||
ROUND_BIG8( 8, alpha_f); \
|
||||
ROUND_BIG8( 9, alpha_f); \
|
||||
ROUND_BIG8(10, alpha_f); \
|
||||
ROUND_BIG8(11, alpha_f); \
|
||||
__m512i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG8 \
|
||||
@@ -965,26 +1002,25 @@ do { \
|
||||
#define sF m7
|
||||
*/
|
||||
|
||||
#define ROUND_BIG(rc, alpha) \
|
||||
#define ROUND_BIG( alpha ) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
|
||||
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
|
||||
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
|
||||
s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
|
||||
s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
|
||||
s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
|
||||
s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
|
||||
s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
|
||||
s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
|
||||
s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
|
||||
sA = _mm256_xor_si256( sA, alpha[10] ); \
|
||||
sB = _mm256_xor_si256( sB, alpha[11] ); \
|
||||
sC = _mm256_xor_si256( sC, alpha[12] ); \
|
||||
sD = _mm256_xor_si256( sD, alpha[13] ); \
|
||||
sE = _mm256_xor_si256( sE, alpha[14] ); \
|
||||
sF = _mm256_xor_si256( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX( s0, s4, s8, sC ); \
|
||||
SBOX( s1, s5, s9, sD ); \
|
||||
@@ -1064,28 +1100,66 @@ do { \
|
||||
|
||||
#define P_BIG \
|
||||
do { \
|
||||
ROUND_BIG(0, alpha_n); \
|
||||
ROUND_BIG(1, alpha_n); \
|
||||
ROUND_BIG(2, alpha_n); \
|
||||
ROUND_BIG(3, alpha_n); \
|
||||
ROUND_BIG(4, alpha_n); \
|
||||
ROUND_BIG(5, alpha_n); \
|
||||
__m256i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG \
|
||||
do { \
|
||||
ROUND_BIG( 0, alpha_f); \
|
||||
ROUND_BIG( 1, alpha_f); \
|
||||
ROUND_BIG( 2, alpha_f); \
|
||||
ROUND_BIG( 3, alpha_f); \
|
||||
ROUND_BIG( 4, alpha_f); \
|
||||
ROUND_BIG( 5, alpha_f); \
|
||||
ROUND_BIG( 6, alpha_f); \
|
||||
ROUND_BIG( 7, alpha_f); \
|
||||
ROUND_BIG( 8, alpha_f); \
|
||||
ROUND_BIG( 9, alpha_f); \
|
||||
ROUND_BIG(10, alpha_f); \
|
||||
ROUND_BIG(11, alpha_f); \
|
||||
__m256i alpha[16]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG \
|
||||
|
@@ -7,6 +7,7 @@
|
||||
#include "hodl-gate.h"
|
||||
#include "hodl-wolf.h"
|
||||
#include "miner.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
|
||||
|
@@ -1,5 +1,6 @@
|
||||
#include "keccak-gate.h"
|
||||
#include "sph_keccak.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
int hard_coded_eb = 1;
|
||||
|
||||
|
@@ -70,13 +70,13 @@ static const uint64_t RC[] = {
|
||||
|
||||
// Targetted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
|
||||
|
||||
|
@@ -16,7 +16,7 @@
|
||||
typedef struct {
|
||||
blake256_16way_context blake;
|
||||
keccak256_8way_context keccak;
|
||||
cube_4way_context cube;
|
||||
cube_4way_2buf_context cube;
|
||||
skein256_8way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_context groestl;
|
||||
@@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx;
|
||||
bool init_allium_16way_ctx()
|
||||
{
|
||||
keccak256_8way_init( &allium_16way_ctx.keccak );
|
||||
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
|
||||
skein256_8way_init( &allium_16way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_16way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||
|
||||
|
||||
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
@@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
|
||||
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
|
||||
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
|
||||
@@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
typedef struct {
|
||||
blake256_8way_context blake;
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
cube_2way_context cube;
|
||||
skein256_4way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_context groestl;
|
||||
@@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
bool init_allium_8way_ctx()
|
||||
{
|
||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
||||
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_8way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
|
||||
|
||||
intrlv_2x128( vhashA, hash0, hash1, 256 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash0, hash1, vhashA, 256 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 256 );
|
||||
|
||||
intrlv_2x128( vhashA, hash4, hash5, 256 );
|
||||
intrlv_2x128( vhashB, hash6, hash7, 256 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
dintrlv_2x128( hash4, hash5, vhashA, 256 );
|
||||
dintrlv_2x128( hash6, hash7, vhashB, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
|
@@ -188,7 +188,7 @@ bool register_allium_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT | VAES256_OPT;
|
||||
| VAES_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_ror256_64( s1); \
|
||||
s3 = mm512_shufll256_64( s3 ); \
|
||||
s1 = mm512_shuflr256_64( s1); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_rol256_64( s3 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_rol256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
s3 = mm512_ror256_64( s3 );
|
||||
s3 = mm512_shuflr256_64( s3 ); \
|
||||
s1 = mm512_shufll256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
@@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_ror_1x64( s1); \
|
||||
s3 = mm256_shufll_64( s3 ); \
|
||||
s1 = mm256_shuflr_64( s1); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rol_1x64( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rol_1x64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_ror_1x64( s3 );
|
||||
s3 = mm256_shuflr_64( s3 ); \
|
||||
s1 = mm256_shufll_64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_ror256_64( s2, s3 ); \
|
||||
mm128_vrol256_64( s6, s7 ); \
|
||||
mm128_vror256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_rol256_64( s6, s7 ); \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rol256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 ); \
|
||||
mm128_ror256_64( s6, s7 );
|
||||
mm128_vror256_64( s6, s7 ); \
|
||||
mm128_vrol256_64( s2, s3 ); \
|
||||
mm128_swap256_128( s4, s5 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
@@ -13,6 +13,7 @@
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/ripemd/sph_ripemd.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#define EPSa DBL_EPSILON
|
||||
#define EPS1 DBL_EPSILON
|
||||
@@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce )
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
sph_sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
sha256_context sha256;
|
||||
sph_sha512_context sha512;
|
||||
sph_keccak512_context keccak;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx;
|
||||
|
||||
void init_m7m_ctx()
|
||||
{
|
||||
sph_sha256_init( &m7m_ctx );
|
||||
sha256_ctx_init( &m7m_ctx.sha256 );
|
||||
sph_sha512_init( &m7m_ctx.sha512 );
|
||||
sph_keccak512_init( &m7m_ctx.keccak );
|
||||
sph_whirlpool_init( &m7m_ctx.whirlpool );
|
||||
@@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
|
||||
sph_sha256_context ctxf_sha256;
|
||||
|
||||
memcpy(data, pdata, 80);
|
||||
|
||||
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
|
||||
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
|
||||
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
|
||||
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
|
||||
@@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
|
||||
|
||||
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha256_close( &ctx2.sha256, bhash[0] );
|
||||
sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sha256_final( &ctx2.sha256, bhash[0] );
|
||||
|
||||
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_sha512_close( &ctx2.sha512, bhash[1] );
|
||||
@@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
bytes = mpz_sizeinbase(product, 256);
|
||||
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, hash );
|
||||
sha256_full( hash, bdata, bytes );
|
||||
|
||||
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
|
||||
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
|
||||
@@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
mpzscale=bytes;
|
||||
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
|
||||
|
||||
sph_sha256_init( &ctxf_sha256 );
|
||||
sph_sha256( &ctxf_sha256, bdata, bytes );
|
||||
sph_sha256_close( &ctxf_sha256, hash );
|
||||
}
|
||||
sha256_full( hash, bdata, bytes );
|
||||
}
|
||||
|
||||
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
|
||||
&& !opt_benchmark ) )
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
double lbry_calc_network_diff( struct work *work )
|
||||
long double lbry_calc_network_diff( struct work *work )
|
||||
{
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
@@ -12,7 +12,7 @@ double lbry_calc_network_diff( struct work *work )
|
||||
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
double d = (double)0x0000ffff / (double)bits;
|
||||
long double d = (long double)0x0000ffff / (long double)bits;
|
||||
|
||||
for (int m=shift; m < 29; m++) d *= 256.0;
|
||||
for (int m=29; m < shift; m++) d /= 256.0;
|
||||
|
@@ -7,24 +7,19 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sph_ripemd.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
void lbry_hash(void* output, const void* input)
|
||||
{
|
||||
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
|
||||
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) hashA[16];
|
||||
uint32_t _ALIGN(64) hashB[16];
|
||||
uint32_t _ALIGN(64) hashC[16];
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, input, 112 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
sha256_full( hashA, input, 112 );
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
sph_sha512_init( &ctx_sha512 );
|
||||
sph_sha512( &ctx_sha512, hashA, 32 );
|
||||
@@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input)
|
||||
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
|
||||
sph_ripemd160_close( &ctx_ripemd, hashC );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashB, 20 );
|
||||
sph_sha256( &ctx_sha256, hashC, 20 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hashA, 32 );
|
||||
sph_sha256_close( &ctx_sha256, hashA );
|
||||
sha256_ctx_init( &ctx_sha256 );
|
||||
sha256_update( &ctx_sha256, hashB, 20 );
|
||||
sha256_update( &ctx_sha256, hashC, 20 );
|
||||
sha256_final( &ctx_sha256, hashA );
|
||||
|
||||
sha256_full( hashA, hashA, 32 );
|
||||
|
||||
memcpy( output, hashA, 32 );
|
||||
}
|
||||
|
||||
|
@@ -479,7 +479,7 @@ sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
|
||||
* One round of RIPEMD-128. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
ripemd128_round(const unsigned char *data, sph_u32 r[5])
|
||||
ripemd128_round(const unsigned char *data, sph_u32 r[4])
|
||||
{
|
||||
#if SPH_LITTLE_FAST
|
||||
|
||||
|
@@ -69,8 +69,12 @@ typedef unsigned int uint;
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64U
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32U
|
||||
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
|
||||
#define ROTL32(a,b) rol32(a,b)
|
||||
#define ROTR32(a,b) ror32(a,b)
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
|
3265
algo/scrypt/scrypt-core-4way.c
Normal file
3265
algo/scrypt/scrypt-core-4way.c
Normal file
File diff suppressed because it is too large
Load Diff
70
algo/scrypt/scrypt-core-4way.h
Normal file
70
algo/scrypt/scrypt-core-4way.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifndef SCRYPT_CORE_4WAY_H__
|
||||
#define SCRYPT_CORE_4WAY_H__
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD over 4 way parallel
|
||||
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// 4 way parallel over serial SIMD
|
||||
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
|
||||
|
||||
// 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Double buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Triplee buffered 2 way parallel over SIMD128
|
||||
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
|
||||
|
||||
// Serial SIMD128 over 2 way parallel
|
||||
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered simd over parallel
|
||||
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered 2 way
|
||||
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered
|
||||
void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// Parallel 4 way, 4x memory
|
||||
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
|
||||
|
||||
// Linear SIMD 1 way, 1x memory, lowest
|
||||
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Double buffered, 2x memory
|
||||
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Triple buffered
|
||||
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
// Quadruple buffered, 4x memory
|
||||
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
||||
// For reference only
|
||||
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
|
||||
|
||||
#endif
|
||||
|
206
algo/scrypt/scrypt-core-ref.c
Normal file
206
algo/scrypt/scrypt-core-ref.c
Normal file
@@ -0,0 +1,206 @@
|
||||
#include "scrypt-core-ref.h"
|
||||
|
||||
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
|
||||
|
||||
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
||||
{
|
||||
uint32_t x0 = (B[ 0] ^= C[ 0]),
|
||||
x1 = (B[ 1] ^= C[ 1]),
|
||||
x2 = (B[ 2] ^= C[ 2]),
|
||||
x3 = (B[ 3] ^= C[ 3]);
|
||||
uint32_t x4 = (B[ 4] ^= C[ 4]),
|
||||
x5 = (B[ 5] ^= C[ 5]),
|
||||
x6 = (B[ 6] ^= C[ 6]),
|
||||
x7 = (B[ 7] ^= C[ 7]);
|
||||
uint32_t x8 = (B[ 8] ^= C[ 8]),
|
||||
x9 = (B[ 9] ^= C[ 9]),
|
||||
xa = (B[10] ^= C[10]),
|
||||
xb = (B[11] ^= C[11]);
|
||||
uint32_t xc = (B[12] ^= C[12]),
|
||||
xd = (B[13] ^= C[13]),
|
||||
xe = (B[14] ^= C[14]),
|
||||
xf = (B[15] ^= C[15]);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
/* Operate on columns. */
|
||||
x4 ^= ROTL(x0 + xc, 7);
|
||||
x9 ^= ROTL(x5 + x1, 7);
|
||||
xe ^= ROTL(xa + x6, 7);
|
||||
x3 ^= ROTL(xf + xb, 7);
|
||||
x8 ^= ROTL(x4 + x0, 9);
|
||||
xd ^= ROTL(x9 + x5, 9);
|
||||
x2 ^= ROTL(xe + xa, 9);
|
||||
x7 ^= ROTL(x3 + xf, 9);
|
||||
xc ^= ROTL(x8 + x4, 13);
|
||||
x1 ^= ROTL(xd + x9, 13);
|
||||
x6 ^= ROTL(x2 + xe, 13);
|
||||
xb ^= ROTL(x7 + x3, 13);
|
||||
x0 ^= ROTL(xc + x8, 18);
|
||||
x5 ^= ROTL(x1 + xd, 18);
|
||||
xa ^= ROTL(x6 + x2, 18);
|
||||
xf ^= ROTL(xb + x7, 18);
|
||||
|
||||
/* Operate on rows. */
|
||||
x1 ^= ROTL(x0 + x3, 7);
|
||||
x6 ^= ROTL(x5 + x4, 7);
|
||||
xb ^= ROTL(xa + x9, 7);
|
||||
xc ^= ROTL(xf + xe, 7);
|
||||
x2 ^= ROTL(x1 + x0, 9);
|
||||
x7 ^= ROTL(x6 + x5, 9);
|
||||
x8 ^= ROTL(xb + xa, 9);
|
||||
xd ^= ROTL(xc + xf, 9);
|
||||
x3 ^= ROTL(x2 + x1, 13);
|
||||
x4 ^= ROTL(x7 + x6, 13);
|
||||
x9 ^= ROTL(x8 + xb, 13);
|
||||
xe ^= ROTL(xd + xc, 13);
|
||||
x0 ^= ROTL(x3 + x2, 18);
|
||||
x5 ^= ROTL(x4 + x7, 18);
|
||||
xa ^= ROTL(x9 + x8, 18);
|
||||
xf ^= ROTL(xe + xd, 18);
|
||||
|
||||
B[ 0] += x0;
|
||||
B[ 1] += x1;
|
||||
B[ 2] += x2;
|
||||
B[ 3] += x3;
|
||||
B[ 4] += x4;
|
||||
B[ 5] += x5;
|
||||
B[ 6] += x6;
|
||||
B[ 7] += x7;
|
||||
B[ 8] += x8;
|
||||
B[ 9] += x9;
|
||||
B[10] += xa;
|
||||
B[11] += xb;
|
||||
B[12] += xc;
|
||||
B[13] += xd;
|
||||
B[14] += xe;
|
||||
B[15] += xf;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param X input/ouput
|
||||
* @param V scratch buffer
|
||||
* @param N factor (def. 1024)
|
||||
*/
|
||||
void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
|
||||
{
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
memcpy(&V[i * 32], X, 128);
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
uint32_t j = 32 * (X[16] & (N - 1));
|
||||
for (uint8_t k = 0; k < 32; k++)
|
||||
X[k] ^= V[j + k];
|
||||
xor_salsa8(&X[0], &X[16]);
|
||||
xor_salsa8(&X[16], &X[0]);
|
||||
}
|
||||
}
|
||||
|
1697
algo/scrypt/scrypt.c
1697
algo/scrypt/scrypt.c
File diff suppressed because it is too large
Load Diff
@@ -39,17 +39,10 @@
|
||||
void
|
||||
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
|
||||
{
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_context ctx;
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, in, len );
|
||||
sph_sha256_close( &ctx, digest );
|
||||
#else
|
||||
SHA256_CTX ctx;
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, in, len );
|
||||
SHA256_Final( digest, &ctx );
|
||||
#endif
|
||||
sha256_context ctx;
|
||||
sha256_ctx_init( &ctx );
|
||||
sha256_update( &ctx, in, len );
|
||||
sha256_final( &ctx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -71,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
|
||||
void
|
||||
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
{
|
||||
unsigned char pad[64];
|
||||
unsigned char pad[64] __attribute__ ((aligned (64)));
|
||||
unsigned char khash[32];
|
||||
const unsigned char * K = _K;
|
||||
size_t i;
|
||||
@@ -79,51 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
if ( Klen > 64 )
|
||||
{
|
||||
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
sph_sha256( &ctx->ictx, K, Klen );
|
||||
sph_sha256_close( &ctx->ictx, khash );
|
||||
#else
|
||||
SHA256_Init( &ctx->ictx );
|
||||
SHA256_Update( &ctx->ictx, K, Klen );
|
||||
SHA256_Final( khash, &ctx->ictx );
|
||||
#endif
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
sha256_update( &ctx->ictx, K, Klen );
|
||||
sha256_final( &ctx->ictx, khash );
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
}
|
||||
|
||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
#else
|
||||
SHA256_Init( &ctx->ictx );
|
||||
#endif
|
||||
sha256_ctx_init( &ctx->ictx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
|
||||
|
||||
memset( pad + Klen, 0x36, 64 - Klen );
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->ictx, pad, 64 );
|
||||
#else
|
||||
SHA256_Update( &ctx->ictx, pad, 64 );
|
||||
#endif
|
||||
sha256_update( &ctx->ictx, pad, 64 );
|
||||
|
||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->octx );
|
||||
#else
|
||||
SHA256_Init( &ctx->octx );
|
||||
#endif
|
||||
sha256_ctx_init( &ctx->octx );
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
|
||||
|
||||
memset( pad + Klen, 0x5c, 64 - Klen );
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->octx, pad, 64 );
|
||||
#else
|
||||
SHA256_Update( &ctx->octx, pad, 64 );
|
||||
#endif
|
||||
sha256_update( &ctx->octx, pad, 64 );
|
||||
}
|
||||
|
||||
/* Add bytes to the HMAC-SHA256 operation. */
|
||||
@@ -131,33 +101,17 @@ void
|
||||
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
|
||||
{
|
||||
/* Feed data to the inner SHA256 operation. */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->ictx, in, len );
|
||||
#else
|
||||
SHA256_Update( &ctx->ictx, in, len );
|
||||
#endif
|
||||
sha256_update( &ctx->ictx, in, len );
|
||||
}
|
||||
|
||||
/* Finish an HMAC-SHA256 operation. */
|
||||
void
|
||||
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
|
||||
HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx )
|
||||
{
|
||||
unsigned char ihash[32];
|
||||
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_close( &ctx->ictx, ihash );
|
||||
sph_sha256( &ctx->octx, ihash, 32 );
|
||||
sph_sha256_close( &ctx->octx, digest );
|
||||
#else
|
||||
/* Finish the inner SHA256 operation. */
|
||||
SHA256_Final( ihash, &ctx->ictx );
|
||||
|
||||
/* Feed the inner hash to the outer SHA256 operation. */
|
||||
SHA256_Update( &ctx->octx, ihash, 32 );
|
||||
|
||||
/* Finish the outer SHA256 operation. */
|
||||
SHA256_Final( digest, &ctx->octx );
|
||||
#endif
|
||||
uint32_t ihash[8] __attribute__ ((aligned (32)));
|
||||
sha256_final( &ctx->ictx, ihash );
|
||||
sha256_update( &ctx->octx, ihash, 32 );
|
||||
sha256_final( &ctx->octx, digest );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -170,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
|
||||
{
|
||||
HMAC_SHA256_CTX PShctx, hctx;
|
||||
uint8_t _ALIGN(128) T[32];
|
||||
uint8_t _ALIGN(128) U[32];
|
||||
uint64_t _ALIGN(128) T[4];
|
||||
uint64_t _ALIGN(128) U[4];
|
||||
// uint8_t _ALIGN(128) T[32];
|
||||
// uint8_t _ALIGN(128) U[32];
|
||||
uint32_t ivec;
|
||||
size_t i, clen;
|
||||
uint64_t j;
|
||||
@@ -207,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
|
||||
// _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] );
|
||||
// _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] );
|
||||
|
||||
// for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
for ( k = 0; k < 4; k++ ) T[k] ^= U[k];
|
||||
|
||||
for ( k = 0; k < 32; k++ )
|
||||
T[k] ^= U[k];
|
||||
// for ( k = 0; k < 32; k++ )
|
||||
// T[k] ^= U[k];
|
||||
}
|
||||
|
||||
/* Copy as many bytes as necessary into buf. */
|
||||
|
@@ -29,30 +29,20 @@
|
||||
#ifndef HMAC_SHA256_H__
|
||||
#define HMAC_SHA256_H__
|
||||
|
||||
//#define HMAC_SSL_SHA 1
|
||||
#define HMAC_SPH_SHA 1
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#include "sha256-hash.h"
|
||||
|
||||
typedef struct HMAC_SHA256Context
|
||||
{
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_context ictx;
|
||||
sph_sha256_context octx;
|
||||
#else
|
||||
SHA256_CTX ictx;
|
||||
SHA256_CTX octx;
|
||||
#endif
|
||||
sha256_context ictx;
|
||||
sha256_context octx;
|
||||
} HMAC_SHA256_CTX;
|
||||
|
||||
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
|
||||
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * );
|
||||
void HMAC_SHA256_Buf( const void *, size_t Klen, const void *,
|
||||
size_t len, uint8_t digest[32] );
|
||||
|
||||
|
@@ -51,7 +51,6 @@ typedef struct {
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
@@ -59,8 +58,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
@@ -72,15 +79,23 @@ typedef struct {
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -92,19 +107,23 @@ typedef struct {
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid );
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
@@ -8,7 +8,7 @@
|
||||
* any later version. See COPYING for more details.
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "sha256d-4way.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
@@ -180,6 +180,9 @@ static const uint32_t sha256d_hash1[16] = {
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000100
|
||||
};
|
||||
|
||||
// this performs the entire hash all over again, why?
|
||||
// because main function only does 56 rounds.
|
||||
|
||||
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
{
|
||||
uint32_t S[16];
|
||||
@@ -195,6 +198,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
hash[i] = swab32(hash[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
#if defined (__SHA__)
|
||||
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
@@ -241,6 +245,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
}
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
static inline void sha256d_preextend(uint32_t *W)
|
||||
{
|
||||
@@ -489,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
|
||||
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_4way( struct work *work,
|
||||
static inline int scanhash_sha256d_4way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -550,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
|
||||
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_8way( struct work *work,
|
||||
static inline int scanhash_sha256d_8way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -606,11 +611,11 @@ static inline int scanhash_sha256d_8way( struct work *work,
|
||||
|
||||
#endif /* HAVE_SHA256_8WAY */
|
||||
|
||||
int scanhash_sha256d( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(128) data[64];
|
||||
uint32_t _ALIGN(32) hash[8];
|
||||
uint32_t _ALIGN(32) midstate[8];
|
||||
@@ -621,12 +626,12 @@ int scanhash_sha256d( struct work *work,
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
#ifdef HAVE_SHA256_8WAY
|
||||
if (sha256_use_8way())
|
||||
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
|
||||
if ( sha256_use_8way() )
|
||||
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
if (sha256_use_4way())
|
||||
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
|
||||
if ( sha256_use_4way() )
|
||||
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
|
||||
memcpy(data, pdata + 16, 64);
|
||||
@@ -653,6 +658,7 @@ int scanhash_sha256d( struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -682,13 +688,20 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d;
|
||||
gate->hash = (void*)&sha256d;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
//#elif defined(SHA256D_8WAY)
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256d_pooler;
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -7,9 +7,9 @@
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
@@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
|
||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||
|
||||
// Load initial values
|
||||
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE_X = STATE0_X;
|
||||
ABEF_SAVE_Y = STATE0_Y;
|
||||
CDGH_SAVE_X = STATE1_X;
|
||||
CDGH_SAVE_Y = STATE1_Y;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
|
||||
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
|
||||
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
|
||||
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
|
||||
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
|
||||
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 16-19
|
||||
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 20-23
|
||||
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 24-27
|
||||
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 28-31
|
||||
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 32-35
|
||||
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -7,9 +7,9 @@
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
@@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
|
||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP, MASK;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Add values back to state
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,18 +0,0 @@
|
||||
#ifndef SHA2_HASH_OPT_H__
|
||||
#define SHA2_HASH_OPT_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
// 2 way with interleaved instructions
|
||||
void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
#endif
|
||||
#endif
|
142
algo/sha/sha256-hash.c
Normal file
142
algo/sha/sha256-hash.c
Normal file
@@ -0,0 +1,142 @@
|
||||
#include "sha256-hash.h"
|
||||
|
||||
static const uint32_t SHA256_IV[8] =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
/*
|
||||
static const uint8_t SHA256_PAD[64] =
|
||||
{
|
||||
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
*/
|
||||
|
||||
void sha256_ctx_init( sha256_context *ctx )
|
||||
{
|
||||
memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV );
|
||||
ctx->count = 0;
|
||||
}
|
||||
|
||||
void sha256_update( sha256_context *ctx, const void *data, size_t len )
|
||||
{
|
||||
int ptr = ctx->count & 0x3f;
|
||||
const uint8_t *src = data;
|
||||
|
||||
ctx->count += (uint64_t)len;
|
||||
|
||||
if ( len < 64 - ptr )
|
||||
{
|
||||
memcpy( ctx->buf + ptr, src, len );
|
||||
return;
|
||||
}
|
||||
|
||||
memcpy( ctx->buf + ptr, src, 64 - ptr );
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
src += 64 - ptr;
|
||||
len -= 64 - ptr;
|
||||
|
||||
while ( len >= 64 )
|
||||
{
|
||||
sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state );
|
||||
src += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
memcpy( ctx->buf, src, len );
|
||||
}
|
||||
|
||||
#if 0
|
||||
void sha256_final( sha256_context *ctx, uint32_t *hash )
|
||||
{
|
||||
size_t r;
|
||||
|
||||
|
||||
/* Figure out how many bytes we have buffered. */
|
||||
r = ctx->count & 0x3f;
|
||||
// r = ( ctx->count >> 3 ) & 0x3f;
|
||||
|
||||
//printf("final: count= %d, r= %d\n", ctx->count, r );
|
||||
|
||||
/* Pad to 56 mod 64, transforming if we finish a block en route. */
|
||||
if ( r < 56 )
|
||||
{
|
||||
/* Pad to 56 mod 64. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Finish the current block and mix. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
/* The start of the final block is all zeroes. */
|
||||
memset( &ctx->buf[0], 0, 56 );
|
||||
}
|
||||
|
||||
/* Add the terminating bit-count. */
|
||||
ctx->buf[56] = bswap_64( ctx->count << 3 );
|
||||
// ctx->buf[56] = bswap_64( ctx->count );
|
||||
// be64enc( &ctx->buf[56], ctx->count );
|
||||
|
||||
/* Mix in the final block. */
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
|
||||
|
||||
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
|
||||
|
||||
/*
|
||||
// be32enc_vect(digest, ctx->state, 4);
|
||||
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
|
||||
// Encode vector, two words at a time.
|
||||
do {
|
||||
be32enc(&dst[0], src[0]);
|
||||
be32enc(&dst[4], src[1]);
|
||||
src += 2;
|
||||
dst += 8;
|
||||
} while (--len);
|
||||
*/
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
void sha256_final( sha256_context *ctx, void *hash )
|
||||
{
|
||||
int ptr = ctx->count & 0x3f;
|
||||
|
||||
ctx->buf[ ptr++ ] = 0x80;
|
||||
|
||||
if ( ptr > 56 )
|
||||
{
|
||||
memset( ctx->buf + ptr, 0, 64 - ptr );
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
memset( ctx->buf, 0, 56 );
|
||||
}
|
||||
else
|
||||
memset( ctx->buf + ptr, 0, 56 - ptr );
|
||||
|
||||
*(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 );
|
||||
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] );
|
||||
}
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len )
|
||||
{
|
||||
sha256_context ctx;
|
||||
sha256_ctx_init( &ctx );
|
||||
sha256_update( &ctx, data, len );
|
||||
sha256_final( &ctx, hash );
|
||||
}
|
||||
|
60
algo/sha/sha256-hash.h
Normal file
60
algo/sha/sha256-hash.h
Normal file
@@ -0,0 +1,60 @@
|
||||
#ifndef SHA256_HASH_H__
|
||||
#define SHA256_HASH_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "cpuminer-config.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
|
||||
// generic interface
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
uint32_t state[8];
|
||||
uint64_t count;
|
||||
} sha256_context __attribute__((aligned(64)));
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len );
|
||||
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||
void sha256_final( sha256_context *ctx, void *hash );
|
||||
void sha256_ctx_init( sha256_context *ctx );
|
||||
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in );
|
||||
|
||||
// 2 way with interleaved instructions
|
||||
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
// Select target
|
||||
// with SHA...
|
||||
#define sha256_transform_le sha256_opt_transform_le
|
||||
#define sha256_transform_be sha256_opt_transform_be
|
||||
|
||||
#else
|
||||
|
||||
// without SHA...
|
||||
#define sha256_transform_le sph_sha256_transform_le
|
||||
#define sha256_transform_be sph_sha256_transform_be
|
||||
|
||||
#endif
|
||||
|
||||
// SHA can't do only 3 rounds
|
||||
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||
|
||||
#endif
|
@@ -1,4 +1,4 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include "sha256d-4way.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -10,12 +10,14 @@
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -23,7 +25,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
@@ -35,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -45,47 +55,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, midstate );
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
@@ -93,12 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -113,10 +120,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -127,36 +142,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
@@ -174,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -198,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -209,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -250,3 +269,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#elif defined(SHA256D_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#elif defined(SHA256D_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
||||
|
46
algo/sha/sha256d-4way.h
Normal file
46
algo/sha/sha256d-4way.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef __SHA256D_4WAY_H__
|
||||
#define __SHA256D_4WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256D_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256D_8WAY 1
|
||||
#else
|
||||
#define SHA256D_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SHA256D_16WAY)
|
||||
|
||||
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
|
||||
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_4WAY)
|
||||
|
||||
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
|
||||
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
8
algo/sha/sha256d.c
Normal file
8
algo/sha/sha256d.c
Normal file
@@ -0,0 +1,8 @@
|
||||
#include "sha256d.h"
|
||||
|
||||
void sha256d( void *hash, const void *data, int len )
|
||||
{
|
||||
sha256_full( hash, data, len );
|
||||
sha256_full( hash, hash, 32 );
|
||||
}
|
||||
|
7
algo/sha/sha256d.h
Normal file
7
algo/sha/sha256d.h
Normal file
@@ -0,0 +1,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256d( void *hash, const void *data, int len );
|
||||
|
@@ -3,14 +3,14 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
||||
static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256q_midstate( const void* input )
|
||||
{
|
||||
sph_sha256_init( &sha256q_ctx );
|
||||
sph_sha256( &sha256q_ctx, input, 64 );
|
||||
sha256_ctx_init( &sha256q_ctx );
|
||||
sha256_update( &sha256q_ctx, input, 64 );
|
||||
}
|
||||
|
||||
int sha256q_hash( void* output, const void* input )
|
||||
@@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input )
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
sha256_context ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
|
||||
|
||||
sph_sha256( &ctx, input + midlen, tail );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, output );
|
||||
sha256_update( &ctx, input + midlen, tail );
|
||||
sha256_final( &ctx, hash );
|
||||
|
||||
sha256_full( hash, hash, 32 );
|
||||
sha256_full( hash, hash, 32 );
|
||||
sha256_full( output, hash, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@@ -10,13 +10,14 @@
|
||||
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -31,12 +32,19 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -46,48 +54,40 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
// sha256_16way_transform( hash32, block, midstate );
|
||||
// 1. final 16 bytes of data, pre-padded
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
sha256_16way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_16way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -102,12 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -122,10 +124,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -136,44 +146,40 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
sha256_8way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -182,17 +188,110 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
// Optimizations are slower with AVX/SSE2
|
||||
// https://github.com/JayDDee/cpuminer-opt/issues/344
|
||||
/*
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -211,6 +310,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -222,29 +329,13 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
@@ -265,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -4,120 +4,12 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
//#include "algo/sha/sph_sha2.h"
|
||||
#include "sha256-hash-opt.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
// Only used on CPUs with SHA
|
||||
|
||||
/*
|
||||
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_midstate( const void* input )
|
||||
{
|
||||
sph_sha256_init( &sha256t_ctx );
|
||||
sph_sha256( &sha256t_ctx, input, 64 );
|
||||
}
|
||||
|
||||
int sha256t_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[16];
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
|
||||
|
||||
sph_sha256( &ctx, input + midlen, tail );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, output );
|
||||
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
|
||||
|
||||
|
||||
// uint32_t edata[20] __attribute__((aligned(64)));
|
||||
// uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// mm128_bswap32_80( edata, pdata );
|
||||
// sha256t_midstate( edata );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block, pdata + 16, 16 );
|
||||
block[ 4] = 0x80000000;
|
||||
memset( block + 5, 0, 40 );
|
||||
block[15] = 80*8; // bit count
|
||||
sha256_opt_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block, hash32, 32 );
|
||||
block[ 8] = 0x80000000;
|
||||
memset( block + 9, 0, 24 );
|
||||
block[15] = 32*8; // bit count
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block, hash32, 32 );
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash32, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash32, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
submit_solution( work, hash32, mythr );
|
||||
n++;
|
||||
pdata[19] = n;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
sha256_opt_transform_le( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
@@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash0, 0 ) =
|
||||
|
@@ -95,32 +95,36 @@ static const uint64_t K512[80] =
|
||||
|
||||
// SHA-512 8 way 64 bit
|
||||
|
||||
#define CH8W(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJ8W(X, Y, Z) \
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG8W_5_0(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )
|
||||
#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \
|
||||
_mm512_ror_epi64( x, 34 ), \
|
||||
_mm512_ror_epi64( x, 39 ) )
|
||||
|
||||
#define BSG8W_5_1(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )
|
||||
#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \
|
||||
_mm512_ror_epi64( x, 18 ), \
|
||||
_mm512_ror_epi64( x, 41 ) )
|
||||
|
||||
#define SSG8W_5_0(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )
|
||||
#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \
|
||||
_mm512_ror_epi64( x, 8 ), \
|
||||
_mm512_srli_epi64( x, 7 ) )
|
||||
|
||||
#define SSG8W_5_1(x) \
|
||||
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
|
||||
#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \
|
||||
_mm512_ror_epi64( x, 61 ), \
|
||||
_mm512_srli_epi64( x, 6 ) )
|
||||
|
||||
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
|
||||
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
|
||||
D = _mm512_add_epi64( D, T1 ); \
|
||||
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
|
||||
__m512i T1 = BSG8W_5_1( E ); \
|
||||
__m512i T2 = BSG8W_5_0( A ); \
|
||||
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
|
||||
T1 = _mm512_add_epi64( T1, H ); \
|
||||
T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \
|
||||
T1 = _mm512_add_epi64( T1, T0 ); \
|
||||
D = _mm512_add_epi64( D, T1 ); \
|
||||
H = _mm512_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
@@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
// SHA-512 4 way 64 bit
|
||||
|
||||
|
||||
#define CH(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
/*
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
*/
|
||||
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
@@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
||||
|
||||
/*
|
||||
#define BSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
|
||||
|
||||
#define BSG5_1(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
|
||||
*/
|
||||
/*
|
||||
#define SSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
|
||||
return _mm256_add_epi64( w0a, w1a );
|
||||
}
|
||||
|
||||
/*
|
||||
#define SSG512x2_0( w0, w1, i ) do \
|
||||
{ \
|
||||
__m256i X0a, X1a, X0b, X1b; \
|
||||
X0a = mm256_ror_64( W[i-15], 1 ); \
|
||||
X1a = mm256_ror_64( W[i-14], 1 ); \
|
||||
X0b = mm256_ror_64( W[i-15], 8 ); \
|
||||
X1b = mm256_ror_64( W[i-14], 8 ); \
|
||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
||||
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
|
||||
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
|
||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
||||
} while(0)
|
||||
|
||||
#define SSG512x2_1( w0, w1, i ) do \
|
||||
{ \
|
||||
__m256i X0a, X1a, X0b, X1b; \
|
||||
X0a = mm256_ror_64( W[i-2],19 ); \
|
||||
X1a = mm256_ror_64( W[i-1],19 ); \
|
||||
X0b = mm256_ror_64( W[i-2],61 ); \
|
||||
X1b = mm256_ror_64( W[i-1],61 ); \
|
||||
X0a = _mm256_xor_si256( X0a, X0b ); \
|
||||
X1a = _mm256_xor_si256( X1a, X1b ); \
|
||||
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
|
||||
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
|
||||
w0 = _mm256_xor_si256( X0a, X0b ); \
|
||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
||||
} while(0)
|
||||
*/
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
||||
__m256i T1 = mm256_ror_64( E, 23 ); \
|
||||
__m256i T2 = mm256_ror_64( A, 5 ); \
|
||||
__m256i T3 = _mm256_xor_si256( F, G ); \
|
||||
__m256i T4 = _mm256_or_si256( A, B ); \
|
||||
__m256i T5 = _mm256_and_si256( A, B ); \
|
||||
K = _mm256_add_epi64( K, W[i] ); \
|
||||
T1 = _mm256_xor_si256( T1, E ); \
|
||||
T2 = _mm256_xor_si256( T2, A ); \
|
||||
T3 = _mm256_and_si256( T3, E ); \
|
||||
T4 = _mm256_and_si256( T4, C ); \
|
||||
K = _mm256_add_epi64( H, K ); \
|
||||
T1 = mm256_ror_64( T1, 4 ); \
|
||||
T2 = mm256_ror_64( T2, 6 ); \
|
||||
T3 = _mm256_xor_si256( T3, G ); \
|
||||
T4 = _mm256_or_si256( T4, T5 ); \
|
||||
T1 = _mm256_xor_si256( T1, E ); \
|
||||
T2 = _mm256_xor_si256( T2, A ); \
|
||||
T1 = mm256_ror_64( T1, 14 ); \
|
||||
T2 = mm256_ror_64( T2, 28 ); \
|
||||
T1 = _mm256_add_epi64( T1, T3 ); \
|
||||
T2 = _mm256_add_epi64( T2, T4 ); \
|
||||
T1 = _mm256_add_epi64( T1, K ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
} while (0)
|
||||
*/
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
|
||||
__m256i T1 = BSG5_1(E); \
|
||||
__m256i T2 = BSG5_0(A); \
|
||||
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
|
||||
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i T1, T2; \
|
||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
||||
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
T1 = _mm256_add_epi64( T1, H ); \
|
||||
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
|
||||
T1 = _mm256_add_epi64( T1, T0 ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
static void
|
||||
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
|
@@ -41,7 +41,7 @@
|
||||
|
||||
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
||||
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
|
||||
#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )
|
||||
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
|
||||
#define ROTR SPH_ROTR32
|
||||
|
||||
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
|
||||
@@ -71,198 +71,6 @@ static const sph_u32 H256[8] = {
|
||||
* of the compression function.
|
||||
*/
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP, MASK;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
MSG = _mm_load_si128((const __m128i*) (input+0));
|
||||
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Add values back to state
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state[4], STATE1);
|
||||
}
|
||||
|
||||
#else // no SHA
|
||||
|
||||
/*
|
||||
static const sph_u32 K[64] = {
|
||||
@@ -319,6 +127,7 @@ static const sph_u32 K[64] = {
|
||||
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
|
||||
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
|
||||
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
d = SPH_T32(d + t1); \
|
||||
h = SPH_T32(t1 + t2); \
|
||||
} while (0)
|
||||
@@ -329,7 +138,7 @@ static const sph_u32 K[64] = {
|
||||
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H; \
|
||||
sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
|
||||
sph_u32 W[16]; \
|
||||
unsigned pcount; \
|
||||
\
|
||||
@@ -342,6 +151,7 @@ static const sph_u32 K[64] = {
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
pcount = 0; \
|
||||
Y_xor_Z = B ^ C; \
|
||||
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
|
||||
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
|
||||
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
|
||||
@@ -389,7 +199,7 @@ static const sph_u32 K[64] = {
|
||||
#else // large footprint (default)
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
|
||||
sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \
|
||||
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
|
||||
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
|
||||
\
|
||||
@@ -401,388 +211,453 @@ static const sph_u32 K[64] = {
|
||||
F = (r)[5]; \
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
Y_xor_Z = B ^ C; \
|
||||
W00 = in(0); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x428A2F98) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = in(1); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x71374491) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = in(2); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB5C0FBCF) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = in(3); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xE9B5DBA5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = in(4); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x3956C25B) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = in(5); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x59F111F1) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = in(6); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x923F82A4) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = in(7); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xAB1C5ED5) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = in(8); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xD807AA98) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = in(9); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x12835B01) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = in(10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x243185BE) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = in(11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x550C7DC3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = in(12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x72BE5D74) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = in(13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x80DEB1FE) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = in(14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x9BDC06A7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = in(15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC19BF174) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xE49B69C1) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xEFBE4786) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x0FC19DC6) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x240CA1CC) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x2DE92C6F) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4A7484AA) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5CB0A9DC) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x76F988DA) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x983E5152) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA831C66D) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB00327C8) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xBF597FC7) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xC6E00BF3) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD5A79147) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x06CA6351) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x14292967) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x27B70A85) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x2E1B2138) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x4D2C6DFC) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x53380D13) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x650A7354) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x766A0ABB) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x81C2C92E) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x92722C85) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xA2BFE8A1) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA81A664B) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xC24B8B70) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xC76C51A3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xD192E819) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD6990624) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xF40E3585) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x106AA070) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x19A4C116) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x1E376C08) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x2748774C) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x34B0BCB5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x391C0CB3) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4ED8AA4A) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5B9CCA4F) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x682E6FF3) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x748F82EE) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x78A5636F) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x84C87814) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x8CC70208) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x90BEFFFA) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xA4506CEB) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xBEF9A3F7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC67178F2) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
(r)[0] = SPH_T32((r)[0] + A); \
|
||||
@@ -808,8 +683,54 @@ sha2_round(const unsigned char *data, sph_u32 r[8])
|
||||
#undef SHA2_IN
|
||||
}
|
||||
|
||||
#endif // SHA else
|
||||
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
memcpy( state_out, state_in, 32 );
|
||||
#define SHA2_IN(x) (data[x])
|
||||
SHA2_ROUND_BODY( SHA2_IN, state_out );
|
||||
#undef SHA2_IN
|
||||
}
|
||||
|
||||
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
memcpy( state_out, state_in, 32 );
|
||||
#define SHA2_IN(x) sph_dec32be_aligned( data+(x) )
|
||||
SHA2_ROUND_BODY( SHA2_IN, state_out );
|
||||
#undef SHA2_IN
|
||||
|
||||
}
|
||||
|
||||
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
|
||||
memcpy( state_out, state_in, 32 );
|
||||
|
||||
t1 = state_out[7] + BSG2_1( state_out[4] )
|
||||
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
|
||||
t2 = BSG2_0( state_out[0] )
|
||||
+ MAJ( state_out[0], state_out[1], state_out[2] );
|
||||
Y_xor_Z = X_xor_Y;
|
||||
state_out[3] += t1;
|
||||
state_out[7] = t1 + t2;
|
||||
|
||||
t1 = state_out[6] + BSG2_1( state_out[3] )
|
||||
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
|
||||
t2 = BSG2_0( state_out[7] )
|
||||
+ MAJ( state_out[7], state_out[0], state_out[1] );
|
||||
Y_xor_Z = X_xor_Y;
|
||||
state_out[2] += t1;
|
||||
state_out[6] = t1 + t2;
|
||||
|
||||
t1 = state_out[5] + BSG2_1( state_out[2] )
|
||||
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
|
||||
t2 = BSG2_0( state_out[6] )
|
||||
+ MAJ( state_out[6], state_out[7], state_out[0] );
|
||||
state_out[1] += t1;
|
||||
state_out[5] = t1 + t2;
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
|
@@ -207,6 +207,16 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
|
||||
|
||||
void sph_sha256_full( void *dst, const void *data, size_t len );
|
||||
|
||||
// These shouldn't be called directly, use sha256-hash.h generic functions
|
||||
// sha256_transform_le & sha256_transform_be instead.
|
||||
void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
|
||||
#if SPH_64
|
||||
|
@@ -70,6 +70,8 @@ extern "C"{
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
|
||||
const __m256i THREE = _mm256_set1_epi32( 3 ); \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE8(state) do \
|
||||
@@ -314,8 +316,7 @@ do { \
|
||||
_mm256_andnot_si256( xb3, xb2 ), \
|
||||
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
|
||||
_mm256_set1_epi32(5UL) ) ), \
|
||||
_mm256_set1_epi32(3UL) ) ) ); \
|
||||
FIVE ) ), THREE ) ) ); \
|
||||
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
|
||||
} while (0)
|
||||
|
||||
@@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 Wlow, Whigh;
|
||||
const __m128i FIVE = _mm_set1_epi32( 5 ); \
|
||||
const __m128i THREE = _mm_set1_epi32( 3 ); \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do \
|
||||
{ \
|
||||
@@ -931,8 +934,8 @@ do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
_mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
|
||||
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
|
||||
) ), _mm_set1_epi32(3UL) ) ) ) ); \
|
||||
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
|
||||
) ), THREE ) ) ) ); \
|
||||
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
|
||||
} while (0)
|
||||
|
||||
|
@@ -20,8 +20,8 @@ static const uint32_t IV512[] =
|
||||
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror128_32( a ), \
|
||||
mm256_ror128_32( b ), 0x88 )
|
||||
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
|
||||
mm256_shuflr128_32( b ), 0x88 )
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
@@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( k00,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
|
||||
@@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( k01,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( k02,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( k03,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( k10,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
k12 = _mm256_xor_si256( k11,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( k12,
|
||||
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
|
||||
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
|
||||
@@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k01 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k02 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k03 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p1 = _mm256_xor_si256( p1, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k10 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k11 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
k12 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k12 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k12, zero ) ), k11 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k13 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||
|
||||
@@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k00 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k01 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k02 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k03 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||
|
||||
p3 = _mm256_xor_si256( p3, x );
|
||||
|
||||
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k10 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k11 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||
|
||||
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
|
||||
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
|
||||
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||
k13 = _mm256_xor_si256( mm256_shuflr128_32(
|
||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||
|
||||
|
@@ -12,8 +12,8 @@ static const uint32_t IV512[] =
|
||||
};
|
||||
|
||||
#define mm512_ror2x512hi_1x32( a, b ) \
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
|
||||
mm512_ror128_32( b ) )
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
|
||||
mm512_shuflr128_32( b ) )
|
||||
|
||||
static void
|
||||
c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
|
||||
__m512i *M = (__m512i*)msg;
|
||||
__m512i *H = (__m512i*)ctx->h;
|
||||
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
|
||||
ctx->count1, ctx->count0 );
|
||||
int r;
|
||||
|
||||
P0 = H[0];
|
||||
@@ -58,46 +60,46 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
|
||||
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
|
||||
K0 = _mm512_xor_si512( K0,
|
||||
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K0,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
|
||||
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
|
||||
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K1,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( K2,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
K4 = _mm512_xor_si512( K3,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( K4,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( K5,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( K6,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
|
||||
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
|
||||
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
@@ -128,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K1 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K2 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K3 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K4 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K6 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K7 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
@@ -185,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K0 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K1 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K2 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K3 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K4 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( mm512_ror128_32(
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
|
||||
K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
|
||||
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
|
||||
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
|
||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7= _mm512_xor_si512( mm512_ror128_32(
|
||||
K7= _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
|
@@ -74,15 +74,15 @@ static const sph_u32 IV512[] = {
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if defined(__AVX2__)
|
||||
// 2 way version of above
|
||||
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
|
||||
mm256_rol256_3x32( b ), 0x88 )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
static void
|
||||
c512( sph_shavite_big_context *sc, const void *msg )
|
||||
@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round
|
||||
|
||||
// working proof of concept
|
||||
/*
|
||||
__m512i K = m512_const1_128( m[0] );
|
||||
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
|
||||
X = _mm512_aesenc_epi128( X, m512_zero );
|
||||
k00 = _mm512_castsi512_si128( K );
|
||||
x = _mm512_castsi512_si128( X );
|
||||
*/
|
||||
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
@@ -144,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
if ( r == 0 )
|
||||
@@ -153,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
if ( r == 1 )
|
||||
@@ -162,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
if ( r == 2 )
|
||||
@@ -231,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
x = _mm_xor_si128( p2, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
x = _mm_xor_si128( p0, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
@@ -304,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
|
||||
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
|
||||
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
|
||||
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
|
||||
|
||||
|
||||
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
||||
// c1_16(185), c1_16(233) };
|
||||
|
||||
|
||||
S0l = _mm512_xor_si512( S[0], M[0] );
|
||||
S0h = _mm512_xor_si512( S[1], M[1] );
|
||||
S1l = _mm512_xor_si512( S[2], M[2] );
|
||||
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
// targetted, local macros don't need a unique name
|
||||
#define S(i) S##i
|
||||
|
||||
#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
|
||||
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
|
||||
|
||||
/*
|
||||
#define F_0(B, C, D) \
|
||||
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
|
||||
#define F_1(B, C, D) \
|
||||
_mm512_or_si512( _mm512_and_si512( D, C ),\
|
||||
_mm512_and_si512( _mm512_or_si512( D,C ), B ) )
|
||||
*/
|
||||
|
||||
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
|
||||
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
|
||||
|
@@ -6,10 +6,6 @@
|
||||
|
||||
#define PRINT_SOME 0
|
||||
|
||||
/* JDD all ocurrances of macro X in this file renamed to XX
|
||||
* due to name conflict
|
||||
*/
|
||||
|
||||
int SupportedLength(int hashbitlen) {
|
||||
if (hashbitlen <= 0 || hashbitlen > 512)
|
||||
return 0;
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include "skein-hash-4way.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#if defined (SKEIN_8WAY)
|
||||
|
||||
@@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input )
|
||||
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
sph_sha256_context ctx_sha256;
|
||||
#else
|
||||
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
|
||||
sha256_4way_context ctx_sha256;
|
||||
@@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input )
|
||||
#if defined(__SHA__)
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash0 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash1 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash2 );
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash3 );
|
||||
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
|
||||
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
#else
|
||||
|
@@ -5,21 +5,18 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_skein.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
void skeinhash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_sha256_context ctx_sha256;
|
||||
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, input, 80 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
sph_sha256_init( &ctx_sha256 );
|
||||
sph_sha256( &ctx_sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx_sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
@@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input)
|
||||
int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t hash64[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -36,7 +33,7 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
|
@@ -18,16 +18,20 @@
|
||||
#ifndef __INTTYPES_H_
|
||||
#define __INTTYPES_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Use [u]intN_t if you need exactly N bits.
|
||||
XXX - doesn't handle the -mint8 option. */
|
||||
|
||||
typedef signed char swift_int8_t;
|
||||
typedef unsigned char swift_uint8_t;
|
||||
|
||||
typedef int swift_int16_t;
|
||||
typedef int32_t swift_int16_t;
|
||||
// typedef int swift_int16_t;
|
||||
typedef unsigned int swift_uint16_t;
|
||||
|
||||
typedef long swift_int32_t;
|
||||
typedef int32_t swift_int32_t;
|
||||
// typedef long swift_int32_t;
|
||||
typedef unsigned long swift_uint32_t;
|
||||
|
||||
typedef long long swift_int64_t;
|
||||
|
@@ -1,912 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// SWIFFTX ANSI C OPTIMIZED 32BIT IMPLEMENTATION FOR NIST SHA-3 COMPETITION
|
||||
//
|
||||
// SWIFFTX.c
|
||||
//
|
||||
// October 2008
|
||||
//
|
||||
// This is the source file of the OPTIMIZED 32BIT implementation of SWIFFTX hash function.
|
||||
// SWIFFTX is a candidate function for SHA-3 NIST competition.
|
||||
// More details about SWIFFTX can be found in the accompanying submission documents.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#include "swifftx.h"
|
||||
// See the remarks concerning compatibility issues inside stdint.h.
|
||||
#include "stdint.h"
|
||||
// Remove this while using gcc:
|
||||
//#include "stdbool.h"
|
||||
#include <memory.h>
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Constants and static tables portion.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// In SWIFFTX we work over Z_257, so this is the modulus and the arithmetic is performed modulo
|
||||
// this number.
|
||||
#define FIELD_SIZE 257
|
||||
|
||||
// The size of FFT we use:
|
||||
#define N 64
|
||||
|
||||
#define LOGN 6
|
||||
|
||||
#define EIGHTH_N (N / 8)
|
||||
|
||||
// The number of FFTS done on the input.
|
||||
#define M (SWIFFTX_INPUT_BLOCK_SIZE / 8) // 32
|
||||
|
||||
// Omega is the 128th root of unity in Z_257.
|
||||
// We choose w = 42.
|
||||
#define OMEGA 42
|
||||
|
||||
// The size of the inner FFT lookup table:
|
||||
#define W 8
|
||||
|
||||
// Calculates the sum and the difference of two numbers.
|
||||
//
|
||||
// Parameters:
|
||||
// - A: the first operand. After the operation stores the sum of the two operands.
|
||||
// - B: the second operand. After the operation stores the difference between the first and the
|
||||
// second operands.
|
||||
#define ADD_SUB_4WAY( A, B ) \
|
||||
{ \
|
||||
__m128i temp = B; \
|
||||
B = _mm_sub_epi32( A, B ); \
|
||||
A = _mm_add_epi32( A, temp ); \
|
||||
}
|
||||
|
||||
|
||||
//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
|
||||
|
||||
// Quickly reduces an integer modulo 257.
|
||||
//
|
||||
// Parameters:
|
||||
// - A: the input.
|
||||
|
||||
#define Q_REDUCE( A ) ( _mm_sub_epi32( \
|
||||
_mm_and_epi32( A, m128_const1_32( 0xff ) ), \
|
||||
_mm_srli_epi32( A, 8 ) ) )
|
||||
|
||||
//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
|
||||
|
||||
// Since we need to do the setup only once, this is the indicator variable:
|
||||
static bool wasSetupDone = false;
|
||||
|
||||
// This array stores the powers of omegas that correspond to the indices, which are the input
|
||||
// values. Known also as the "outer FFT twiddle factors".
|
||||
swift_int16_t multipliers[N];
|
||||
|
||||
// This array stores the powers of omegas, multiplied by the corresponding values.
|
||||
// We store this table to save computation time.
|
||||
//
|
||||
// To calculate the intermediate value of the compression function (the first out of two
|
||||
// stages), we multiply the k-th bit of x_i by w^[(2i + 1) * k]. {x_i} is the input to the
|
||||
// compression function, i is between 0 and 31, x_i is a 64-bit value.
|
||||
// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
|
||||
// formula (2), section 3, page 6.
|
||||
swift_int16_t fftTable[256 * EIGHTH_N];
|
||||
|
||||
// The A's we use in SWIFFTX shall be random elements of Z_257.
|
||||
// We generated these A's from the decimal expansion of PI as follows: we converted each
|
||||
// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
|
||||
// element, otherwise move to the next triple of digits in the expansion. This guarntees that
|
||||
// the A's are random, provided that PI digits are.
|
||||
const swift_int16_t As[3 * M * N] =
|
||||
{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
|
||||
50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
|
||||
95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
|
||||
45, 130, 108, 124, 171, 151, 189, 128, 218, 134, 233, 165, 14, 201, 145, 134,
|
||||
52, 203, 91, 96, 197, 69, 134, 213, 136, 93, 3, 249, 141, 16, 210, 73,
|
||||
6, 92, 58, 74, 174, 6, 254, 91, 201, 107, 110, 76, 103, 11, 73, 16,
|
||||
34, 209, 7, 127, 146, 254, 95, 176, 57, 13, 108, 245, 77, 92, 186, 117,
|
||||
124, 97, 105, 118, 34, 74, 205, 122, 235, 53, 94, 238, 210, 227, 183, 11,
|
||||
129, 159, 105, 183, 142, 129, 86, 21, 137, 138, 224, 223, 190, 188, 179, 188,
|
||||
256, 25, 217, 176, 36, 176, 238, 127, 160, 210, 155, 148, 132, 0, 54, 127,
|
||||
145, 6, 46, 85, 243, 95, 173, 123, 178, 207, 211, 183, 224, 173, 146, 35,
|
||||
71, 114, 50, 22, 175, 1, 28, 19, 112, 129, 21, 34, 161, 159, 115, 52,
|
||||
4, 193, 211, 92, 115, 49, 59, 217, 218, 96, 61, 81, 24, 202, 198, 89,
|
||||
45, 128, 8, 51, 253, 87, 171, 35, 4, 188, 171, 10, 3, 137, 238, 73,
|
||||
19, 208, 124, 163, 103, 177, 155, 147, 46, 84, 253, 233, 171, 241, 211, 217,
|
||||
159, 48, 96, 79, 237, 18, 171, 226, 99, 1, 97, 195, 216, 163, 198, 95,
|
||||
0, 201, 65, 228, 21, 153, 124, 230, 44, 35, 44, 108, 85, 156, 249, 207,
|
||||
26, 222, 131, 1, 60, 242, 197, 150, 181, 19, 116, 213, 75, 98, 124, 240,
|
||||
123, 207, 62, 255, 60, 143, 187, 157, 139, 9, 12, 104, 89, 49, 193, 146,
|
||||
104, 196, 181, 82, 198, 253, 192, 191, 255, 122, 212, 104, 47, 20, 132, 208,
|
||||
46, 170, 2, 69, 234, 36, 56, 163, 28, 152, 104, 238, 162, 56, 24, 58,
|
||||
38, 150, 193, 254, 253, 125, 173, 35, 73, 126, 247, 239, 216, 6, 199, 15,
|
||||
90, 12, 97, 122, 9, 84, 207, 127, 219, 72, 58, 30, 29, 182, 41, 192,
|
||||
235, 248, 237, 74, 72, 176, 210, 252, 45, 64, 165, 87, 202, 241, 236, 223,
|
||||
151, 242, 119, 239, 52, 112, 169, 28, 13, 37, 160, 60, 158, 81, 133, 60,
|
||||
16, 145, 249, 192, 173, 217, 214, 93, 141, 184, 54, 34, 161, 104, 157, 95,
|
||||
38, 133, 218, 227, 211, 181, 9, 66, 137, 143, 77, 33, 248, 159, 4, 55,
|
||||
228, 48, 99, 219, 222, 184, 15, 36, 254, 256, 157, 237, 87, 139, 209, 113,
|
||||
232, 85, 126, 167, 197, 100, 103, 166, 64, 225, 125, 205, 117, 135, 84, 128,
|
||||
231, 112, 90, 241, 28, 22, 210, 147, 186, 49, 230, 21, 108, 39, 194, 47,
|
||||
123, 199, 107, 114, 30, 210, 250, 143, 59, 156, 131, 133, 221, 27, 76, 99,
|
||||
208, 250, 78, 12, 211, 141, 95, 81, 195, 106, 8, 232, 150, 212, 205, 221,
|
||||
11, 225, 87, 219, 126, 136, 137, 180, 198, 48, 68, 203, 239, 252, 194, 235,
|
||||
142, 137, 174, 172, 190, 145, 250, 221, 182, 204, 1, 195, 130, 153, 83, 241,
|
||||
161, 239, 211, 138, 11, 169, 155, 245, 174, 49, 10, 166, 16, 130, 181, 139,
|
||||
222, 222, 112, 99, 124, 94, 51, 243, 133, 194, 244, 136, 35, 248, 201, 177,
|
||||
178, 186, 129, 102, 89, 184, 180, 41, 149, 96, 165, 72, 225, 231, 134, 158,
|
||||
199, 28, 249, 16, 225, 195, 10, 210, 164, 252, 138, 8, 35, 152, 213, 199,
|
||||
82, 116, 97, 230, 63, 199, 241, 35, 79, 120, 54, 174, 67, 112, 1, 76,
|
||||
69, 222, 194, 96, 82, 94, 25, 228, 196, 145, 155, 136, 228, 234, 46, 101,
|
||||
246, 51, 103, 166, 246, 75, 9, 200, 161, 4, 108, 35, 129, 168, 208, 144,
|
||||
50, 14, 13, 220, 41, 132, 122, 127, 194, 9, 232, 234, 107, 28, 187, 8,
|
||||
51, 141, 97, 221, 225, 9, 113, 170, 166, 102, 135, 22, 231, 185, 227, 187,
|
||||
110, 145, 251, 146, 76, 22, 146, 228, 7, 53, 64, 25, 62, 198, 130, 190,
|
||||
221, 232, 169, 64, 188, 199, 237, 249, 173, 218, 196, 191, 48, 224, 5, 113,
|
||||
100, 166, 160, 21, 191, 197, 61, 162, 149, 171, 240, 183, 129, 231, 123, 204,
|
||||
192, 179, 134, 15, 47, 161, 142, 177, 239, 234, 186, 237, 231, 53, 208, 95,
|
||||
146, 36, 225, 231, 89, 142, 93, 248, 137, 124, 83, 39, 69, 77, 89, 208,
|
||||
182, 48, 85, 147, 244, 164, 246, 68, 38, 190, 220, 35, 202, 91, 157, 151,
|
||||
201, 240, 185, 218, 4, 152, 2, 132, 177, 88, 190, 196, 229, 74, 220, 135,
|
||||
137, 196, 11, 47, 5, 251, 106, 144, 163, 60, 222, 127, 52, 57, 202, 102,
|
||||
64, 140, 110, 206, 23, 182, 39, 245, 1, 163, 157, 186, 163, 80, 7, 230,
|
||||
44, 249, 176, 102, 164, 125, 147, 120, 18, 191, 186, 125, 64, 65, 198, 157,
|
||||
164, 213, 95, 61, 13, 181, 208, 91, 242, 197, 158, 34, 98, 169, 91, 14,
|
||||
17, 93, 157, 17, 65, 30, 183, 6, 139, 58, 255, 108, 100, 136, 209, 144,
|
||||
164, 6, 237, 33, 210, 110, 57, 126, 197, 136, 125, 244, 165, 151, 168, 3,
|
||||
143, 251, 247, 155, 136, 130, 88, 14, 74, 121, 250, 133, 21, 226, 185, 232,
|
||||
118, 132, 89, 64, 204, 161, 2, 70, 224, 159, 35, 204, 123, 180, 13, 52,
|
||||
231, 57, 25, 78, 66, 69, 97, 42, 198, 84, 176, 59, 8, 232, 125, 134,
|
||||
193, 2, 232, 109, 216, 69, 90, 142, 32, 38, 249, 37, 75, 180, 184, 188,
|
||||
19, 47, 120, 87, 146, 70, 232, 120, 191, 45, 33, 38, 19, 248, 110, 110,
|
||||
44, 64, 2, 84, 244, 228, 252, 228, 170, 123, 38, 144, 213, 144, 171, 212,
|
||||
243, 87, 189, 46, 128, 110, 84, 77, 65, 183, 61, 184, 101, 44, 168, 68,
|
||||
14, 106, 105, 8, 227, 211, 166, 39, 152, 43, 52, 254, 197, 55, 119, 89,
|
||||
168, 65, 53, 138, 177, 56, 219, 0, 58, 121, 148, 18, 44, 100, 215, 103,
|
||||
145, 229, 117, 196, 91, 89, 113, 143, 172, 239, 249, 184, 154, 39, 112, 65,
|
||||
204, 42, 84, 38, 155, 151, 151, 16, 100, 87, 174, 162, 145, 147, 149, 186,
|
||||
237, 145, 134, 144, 198, 235, 213, 163, 48, 230, 24, 47, 57, 71, 127, 0,
|
||||
150, 219, 12, 81, 197, 150, 131, 13, 169, 63, 175, 184, 48, 235, 65, 243,
|
||||
149, 200, 163, 254, 202, 114, 247, 67, 143, 250, 126, 228, 80, 130, 216, 214,
|
||||
36, 2, 230, 33, 119, 125, 3, 142, 237, 100, 3, 152, 197, 174, 244, 129,
|
||||
232, 30, 206, 199, 39, 210, 220, 43, 237, 221, 201, 54, 179, 42, 28, 133,
|
||||
246, 203, 198, 177, 0, 28, 194, 85, 223, 109, 155, 147, 221, 60, 133, 108,
|
||||
157, 254, 26, 75, 157, 185, 49, 142, 31, 137, 71, 43, 63, 64, 237, 148,
|
||||
237, 172, 159, 160, 155, 254, 234, 224, 140, 193, 114, 140, 62, 109, 136, 39,
|
||||
255, 8, 158, 146, 128, 49, 222, 96, 57, 209, 180, 249, 202, 127, 113, 231,
|
||||
78, 178, 46, 33, 228, 215, 104, 31, 207, 186, 82, 41, 42, 39, 103, 119,
|
||||
123, 133, 243, 254, 238, 156, 90, 186, 37, 212, 33, 107, 252, 51, 177, 36,
|
||||
237, 76, 159, 245, 93, 214, 97, 56, 190, 38, 160, 94, 105, 222, 220, 158,
|
||||
49, 16, 191, 52, 120, 87, 179, 2, 27, 144, 223, 230, 184, 6, 129, 227,
|
||||
69, 47, 215, 181, 162, 139, 72, 200, 45, 163, 159, 62, 2, 221, 124, 40,
|
||||
159, 242, 35, 208, 179, 166, 98, 67, 178, 68, 143, 225, 178, 146, 187, 159,
|
||||
57, 66, 176, 192, 236, 250, 168, 224, 122, 43, 159, 120, 133, 165, 122, 64,
|
||||
87, 74, 161, 241, 9, 87, 90, 24, 255, 113, 203, 220, 57, 139, 197, 159,
|
||||
31, 151, 27, 140, 77, 162, 7, 27, 84, 228, 187, 220, 53, 126, 162, 242,
|
||||
84, 181, 223, 103, 86, 177, 207, 31, 140, 18, 207, 256, 201, 166, 96, 23,
|
||||
233, 103, 197, 84, 161, 75, 59, 149, 138, 154, 119, 92, 16, 53, 116, 97,
|
||||
220, 114, 35, 45, 77, 209, 40, 196, 71, 22, 81, 178, 110, 14, 3, 180,
|
||||
110, 129, 112, 47, 18, 61, 134, 78, 73, 79, 254, 232, 125, 180, 205, 54,
|
||||
220, 119, 63, 89, 181, 52, 77, 109, 151, 77, 80, 207, 144, 25, 20, 6,
|
||||
208, 47, 201, 206, 192, 14, 73, 176, 256, 201, 207, 87, 216, 60, 56, 73,
|
||||
92, 243, 179, 113, 49, 59, 55, 168, 121, 137, 69, 154, 95, 57, 187, 47,
|
||||
129, 4, 15, 92, 6, 116, 69, 196, 48, 134, 84, 81, 111, 56, 38, 176,
|
||||
239, 6, 128, 72, 242, 134, 36, 221, 59, 48, 242, 68, 130, 110, 171, 89,
|
||||
13, 220, 48, 29, 5, 75, 104, 233, 91, 129, 105, 162, 44, 113, 163, 163,
|
||||
85, 147, 190, 111, 197, 80, 213, 153, 81, 68, 203, 33, 161, 165, 10, 61,
|
||||
120, 252, 0, 205, 28, 42, 193, 64, 39, 37, 83, 175, 5, 218, 215, 174,
|
||||
128, 121, 231, 11, 150, 145, 135, 197, 136, 91, 193, 5, 107, 88, 82, 6,
|
||||
4, 188, 256, 70, 40, 2, 167, 57, 169, 203, 115, 254, 215, 172, 84, 80,
|
||||
188, 167, 34, 137, 43, 243, 2, 79, 178, 38, 188, 135, 233, 194, 208, 13,
|
||||
11, 151, 231, 196, 12, 122, 162, 56, 17, 114, 191, 207, 90, 132, 64, 238,
|
||||
187, 6, 198, 176, 240, 88, 118, 236, 15, 226, 166, 22, 193, 229, 82, 246,
|
||||
213, 64, 37, 63, 31, 243, 252, 37, 156, 38, 175, 204, 138, 141, 211, 82,
|
||||
106, 217, 97, 139, 153, 56, 129, 218, 158, 9, 83, 26, 87, 112, 71, 21,
|
||||
250, 5, 65, 141, 68, 116, 231, 113, 10, 218, 99, 205, 201, 92, 157, 4,
|
||||
97, 46, 49, 220, 72, 139, 103, 171, 149, 129, 193, 19, 69, 245, 43, 31,
|
||||
58, 68, 36, 195, 159, 22, 54, 34, 233, 141, 205, 100, 226, 96, 22, 192,
|
||||
41, 231, 24, 79, 234, 138, 30, 120, 117, 216, 172, 197, 172, 107, 86, 29,
|
||||
181, 151, 0, 6, 146, 186, 68, 55, 54, 58, 213, 182, 60, 231, 33, 232,
|
||||
77, 210, 216, 154, 80, 51, 141, 122, 68, 148, 219, 122, 254, 48, 64, 175,
|
||||
41, 115, 62, 243, 141, 81, 119, 121, 5, 68, 121, 88, 239, 29, 230, 90,
|
||||
135, 159, 35, 223, 168, 112, 49, 37, 146, 60, 126, 134, 42, 145, 115, 90,
|
||||
73, 133, 211, 86, 120, 141, 122, 241, 127, 56, 130, 36, 174, 75, 83, 246,
|
||||
112, 45, 136, 194, 201, 115, 1, 156, 114, 167, 208, 12, 176, 147, 32, 170,
|
||||
251, 100, 102, 220, 122, 210, 6, 49, 75, 201, 38, 105, 132, 135, 126, 102,
|
||||
13, 121, 76, 228, 202, 20, 61, 213, 246, 13, 207, 42, 148, 168, 37, 253,
|
||||
34, 94, 141, 185, 18, 234, 157, 109, 104, 64, 250, 125, 49, 236, 86, 48,
|
||||
196, 77, 75, 237, 156, 103, 225, 19, 110, 229, 22, 68, 177, 93, 221, 181,
|
||||
152, 153, 61, 108, 101, 74, 247, 195, 127, 216, 30, 166, 168, 61, 83, 229,
|
||||
120, 156, 96, 120, 201, 124, 43, 27, 253, 250, 120, 143, 89, 235, 189, 243,
|
||||
150, 7, 127, 119, 149, 244, 84, 185, 134, 34, 128, 193, 236, 234, 132, 117,
|
||||
137, 32, 145, 184, 44, 121, 51, 76, 11, 228, 142, 251, 39, 77, 228, 251,
|
||||
41, 58, 246, 107, 125, 187, 9, 240, 35, 8, 11, 162, 242, 220, 158, 163,
|
||||
2, 184, 163, 227, 242, 2, 100, 101, 2, 78, 129, 34, 89, 28, 26, 157,
|
||||
79, 31, 107, 250, 194, 156, 186, 69, 212, 66, 41, 180, 139, 42, 211, 253,
|
||||
256, 239, 29, 129, 104, 248, 182, 68, 1, 189, 48, 226, 36, 229, 3, 158,
|
||||
41, 53, 241, 22, 115, 174, 16, 163, 224, 19, 112, 219, 177, 233, 42, 27,
|
||||
250, 134, 18, 28, 145, 122, 68, 34, 134, 31, 147, 17, 39, 188, 150, 76,
|
||||
45, 42, 167, 249, 12, 16, 23, 182, 13, 79, 121, 3, 70, 197, 239, 44,
|
||||
86, 177, 255, 81, 64, 171, 138, 131, 73, 110, 44, 201, 254, 198, 146, 91,
|
||||
48, 9, 104, 31, 29, 161, 101, 31, 138, 180, 231, 233, 79, 137, 61, 236,
|
||||
140, 15, 249, 218, 234, 119, 99, 195, 110, 137, 237, 207, 8, 31, 45, 24,
|
||||
90, 155, 203, 253, 192, 203, 65, 176, 210, 171, 142, 214, 220, 122, 136, 237,
|
||||
189, 186, 147, 40, 80, 254, 173, 33, 191, 46, 192, 26, 108, 255, 228, 205,
|
||||
61, 76, 39, 107, 225, 126, 228, 182, 140, 251, 143, 134, 252, 168, 221, 8,
|
||||
185, 85, 60, 233, 147, 244, 87, 137, 8, 140, 96, 80, 53, 45, 175, 160,
|
||||
124, 189, 112, 37, 144, 19, 70, 17, 170, 242, 2, 3, 28, 95, 120, 199,
|
||||
212, 43, 9, 117, 86, 151, 101, 241, 200, 145, 241, 19, 178, 69, 204, 197,
|
||||
227, 166, 94, 7, 193, 45, 247, 234, 19, 187, 212, 212, 236, 125, 33, 95,
|
||||
198, 121, 122, 103, 77, 155, 235, 49, 25, 237, 249, 11, 162, 7, 238, 24,
|
||||
16, 150, 129, 25, 152, 17, 42, 67, 247, 162, 77, 154, 31, 133, 55, 137,
|
||||
79, 119, 153, 10, 86, 28, 244, 186, 41, 169, 106, 44, 10, 49, 110, 179,
|
||||
32, 133, 155, 244, 61, 70, 131, 168, 170, 39, 231, 252, 32, 69, 92, 238,
|
||||
239, 35, 132, 136, 236, 167, 90, 32, 123, 88, 69, 22, 20, 89, 145, 166,
|
||||
30, 118, 75, 4, 49, 31, 225, 54, 11, 50, 56, 191, 246, 1, 187, 33,
|
||||
119, 107, 139, 68, 19, 240, 131, 55, 94, 113, 31, 252, 12, 179, 121, 2,
|
||||
120, 252, 0, 76, 41, 80, 185, 42, 62, 121, 105, 159, 121, 109, 111, 98,
|
||||
7, 118, 86, 29, 210, 70, 231, 179, 223, 229, 164, 70, 62, 47, 0, 206,
|
||||
204, 178, 168, 120, 224, 166, 99, 25, 103, 63, 246, 224, 117, 204, 75, 124,
|
||||
140, 133, 110, 110, 222, 88, 151, 118, 46, 37, 22, 143, 158, 40, 2, 50,
|
||||
153, 94, 190, 199, 13, 198, 127, 211, 180, 90, 183, 98, 0, 142, 210, 154,
|
||||
100, 187, 67, 231, 202, 100, 198, 235, 252, 160, 247, 124, 247, 14, 121, 221,
|
||||
57, 88, 253, 243, 185, 89, 45, 249, 221, 194, 108, 175, 193, 119, 50, 141,
|
||||
223, 133, 136, 64, 176, 250, 129, 100, 124, 94, 181, 159, 99, 185, 177, 240,
|
||||
135, 42, 103, 52, 202, 208, 143, 186, 193, 103, 154, 237, 102, 88, 225, 161,
|
||||
50, 188, 191, 109, 12, 87, 19, 227, 247, 183, 13, 52, 205, 170, 205, 146,
|
||||
89, 160, 18, 105, 192, 73, 231, 225, 184, 157, 252, 220, 61, 59, 169, 183,
|
||||
221, 20, 141, 20, 158, 101, 245, 7, 245, 225, 118, 137, 84, 55, 19, 27,
|
||||
164, 110, 35, 25, 202, 94, 150, 46, 91, 152, 130, 1, 7, 46, 16, 237,
|
||||
171, 109, 19, 200, 65, 38, 10, 213, 70, 96, 126, 226, 185, 225, 181, 46,
|
||||
10, 165, 11, 123, 53, 158, 22, 147, 64, 22, 227, 69, 182, 237, 197, 37,
|
||||
39, 49, 186, 223, 139, 128, 55, 36, 166, 178, 220, 20, 98, 172, 166, 253,
|
||||
45, 0, 120, 180, 189, 185, 158, 159, 196, 6, 214, 79, 141, 52, 156, 107,
|
||||
5, 109, 142, 159, 33, 64, 190, 133, 95, 132, 95, 202, 160, 63, 186, 23,
|
||||
231, 107, 163, 33, 234, 15, 244, 77, 108, 49, 51, 7, 164, 87, 142, 99,
|
||||
240, 202, 47, 256, 118, 190, 196, 178, 217, 42, 39, 153, 21, 192, 232, 202,
|
||||
14, 82, 179, 64, 233, 4, 219, 10, 133, 78, 43, 144, 146, 216, 202, 81,
|
||||
71, 252, 8, 201, 68, 256, 85, 233, 164, 88, 176, 30, 5, 152, 126, 179,
|
||||
249, 84, 140, 190, 159, 54, 118, 98, 2, 159, 27, 133, 74, 121, 239, 196,
|
||||
71, 149, 119, 135, 102, 20, 87, 112, 44, 75, 221, 3, 151, 158, 5, 98,
|
||||
152, 25, 97, 106, 63, 171, 240, 79, 234, 240, 230, 92, 76, 70, 173, 196,
|
||||
36, 225, 218, 133, 64, 240, 150, 41, 146, 66, 133, 51, 134, 73, 170, 238,
|
||||
140, 90, 45, 89, 46, 147, 96, 169, 174, 174, 244, 151, 90, 40, 32, 74,
|
||||
38, 154, 246, 57, 31, 14, 189, 151, 83, 243, 197, 183, 220, 185, 53, 225,
|
||||
51, 106, 188, 208, 222, 248, 93, 13, 93, 215, 131, 25, 142, 185, 113, 222,
|
||||
131, 215, 149, 50, 159, 85, 32, 5, 205, 192, 2, 227, 42, 214, 197, 42,
|
||||
126, 182, 68, 123, 109, 36, 237, 179, 170, 199, 77, 256, 5, 128, 214, 243,
|
||||
137, 177, 170, 253, 179, 180, 153, 236, 100, 196, 216, 231, 198, 37, 192, 80,
|
||||
121, 221, 246, 1, 16, 246, 29, 78, 64, 148, 124, 38, 96, 125, 28, 20,
|
||||
48, 51, 73, 187, 139, 208, 98, 253, 221, 188, 84, 129, 1, 205, 95, 205,
|
||||
117, 79, 71, 126, 134, 237, 19, 184, 137, 125, 129, 178, 223, 54, 188, 112,
|
||||
30, 7, 225, 228, 205, 184, 233, 87, 117, 22, 58, 10, 8, 42, 2, 114,
|
||||
254, 19, 17, 13, 150, 92, 233, 179, 63, 12, 60, 171, 127, 35, 50, 5,
|
||||
195, 113, 241, 25, 249, 184, 166, 44, 221, 35, 151, 116, 8, 54, 195, 89,
|
||||
218, 186, 132, 5, 41, 89, 226, 177, 11, 41, 87, 172, 5, 23, 20, 59,
|
||||
228, 94, 76, 33, 137, 43, 151, 221, 61, 232, 4, 120, 93, 217, 80, 228,
|
||||
228, 6, 58, 25, 62, 84, 91, 48, 209, 20, 247, 243, 55, 106, 80, 79,
|
||||
235, 34, 20, 180, 146, 2, 236, 13, 236, 206, 243, 222, 204, 83, 148, 213,
|
||||
214, 117, 237, 98, 0, 90, 204, 168, 32, 41, 126, 67, 191, 74, 27, 255,
|
||||
26, 75, 240, 113, 185, 105, 167, 154, 112, 67, 151, 63, 161, 134, 239, 176,
|
||||
42, 87, 249, 130, 45, 242, 17, 100, 107, 120, 212, 218, 237, 76, 231, 162,
|
||||
175, 172, 118, 155, 92, 36, 124, 17, 121, 71, 13, 9, 82, 126, 147, 142,
|
||||
218, 148, 138, 80, 163, 106, 164, 123, 140, 129, 35, 42, 186, 154, 228, 214,
|
||||
75, 73, 8, 253, 42, 153, 232, 164, 95, 24, 110, 90, 231, 197, 90, 196,
|
||||
57, 164, 252, 181, 31, 7, 97, 256, 35, 77, 200, 212, 99, 179, 92, 227,
|
||||
17, 180, 49, 176, 9, 188, 13, 182, 93, 44, 128, 219, 134, 92, 151, 6,
|
||||
23, 126, 200, 109, 66, 30, 140, 180, 146, 134, 67, 200, 7, 9, 223, 168,
|
||||
186, 221, 3, 154, 150, 165, 43, 53, 138, 27, 86, 213, 235, 160, 70, 2,
|
||||
240, 20, 89, 212, 84, 141, 168, 246, 183, 227, 30, 167, 138, 185, 253, 83,
|
||||
52, 143, 236, 94, 59, 65, 89, 218, 194, 157, 164, 156, 111, 95, 202, 168,
|
||||
245, 256, 151, 28, 222, 194, 72, 130, 217, 134, 253, 77, 246, 100, 76, 32,
|
||||
254, 174, 182, 193, 14, 237, 74, 1, 74, 26, 135, 216, 152, 208, 112, 38,
|
||||
181, 62, 25, 71, 61, 234, 254, 97, 191, 23, 92, 256, 190, 205, 6, 16,
|
||||
134, 147, 210, 219, 148, 59, 73, 185, 24, 247, 174, 143, 116, 220, 128, 144,
|
||||
111, 126, 101, 98, 130, 136, 101, 102, 69, 127, 24, 168, 146, 226, 226, 207,
|
||||
176, 122, 149, 254, 134, 196, 22, 151, 197, 21, 50, 205, 116, 154, 65, 116,
|
||||
177, 224, 127, 77, 177, 159, 225, 69, 176, 54, 100, 104, 140, 8, 11, 126,
|
||||
11, 188, 185, 159, 107, 16, 254, 142, 80, 28, 5, 157, 104, 57, 109, 82,
|
||||
102, 80, 173, 242, 238, 207, 57, 105, 237, 160, 59, 189, 189, 199, 26, 11,
|
||||
190, 156, 97, 118, 20, 12, 254, 189, 165, 147, 142, 199, 5, 213, 64, 133,
|
||||
108, 217, 133, 60, 94, 28, 116, 136, 47, 165, 125, 42, 183, 143, 14, 129,
|
||||
223, 70, 212, 205, 181, 180, 3, 201, 182, 46, 57, 104, 239, 60, 99, 181,
|
||||
220, 231, 45, 79, 156, 89, 149, 143, 190, 103, 153, 61, 235, 73, 136, 20,
|
||||
89, 243, 16, 130, 247, 141, 134, 93, 80, 68, 85, 84, 8, 72, 194, 4,
|
||||
242, 110, 19, 133, 199, 70, 172, 92, 132, 254, 67, 74, 36, 94, 13, 90,
|
||||
154, 184, 9, 109, 118, 243, 214, 71, 36, 95, 0, 90, 201, 105, 112, 215,
|
||||
69, 196, 224, 210, 236, 242, 155, 211, 37, 134, 69, 113, 157, 97, 68, 26,
|
||||
230, 149, 219, 180, 20, 76, 172, 145, 154, 40, 129, 8, 93, 56, 162, 124,
|
||||
207, 233, 105, 19, 3, 183, 155, 134, 8, 244, 213, 78, 139, 88, 156, 37,
|
||||
51, 152, 111, 102, 112, 250, 114, 252, 201, 241, 133, 24, 136, 153, 5, 90,
|
||||
210, 197, 216, 24, 131, 17, 147, 246, 13, 86, 3, 253, 179, 237, 101, 114,
|
||||
243, 191, 207, 2, 220, 133, 244, 53, 87, 125, 154, 158, 197, 20, 8, 83,
|
||||
32, 191, 38, 241, 204, 22, 168, 59, 217, 123, 162, 82, 21, 50, 130, 89,
|
||||
239, 253, 195, 56, 253, 74, 147, 125, 234, 199, 250, 28, 65, 193, 22, 237,
|
||||
193, 94, 58, 229, 139, 176, 69, 42, 179, 164, 150, 168, 246, 214, 86, 174,
|
||||
59, 117, 15, 19, 76, 37, 214, 238, 153, 226, 154, 45, 109, 114, 198, 107,
|
||||
45, 70, 238, 196, 142, 252, 244, 71, 123, 136, 134, 188, 99, 132, 25, 42,
|
||||
240, 0, 196, 33, 26, 124, 256, 145, 27, 102, 153, 35, 28, 132, 221, 167,
|
||||
138, 133, 41, 170, 95, 224, 40, 139, 239, 153, 1, 106, 255, 106, 170, 163,
|
||||
127, 44, 155, 232, 194, 119, 232, 117, 239, 143, 108, 41, 3, 9, 180, 256,
|
||||
144, 113, 133, 200, 79, 69, 128, 216, 31, 50, 102, 209, 249, 136, 150, 154,
|
||||
182, 51, 228, 39, 127, 142, 87, 15, 94, 92, 187, 245, 31, 236, 64, 58,
|
||||
114, 11, 17, 166, 189, 152, 218, 34, 123, 39, 58, 37, 153, 91, 63, 121,
|
||||
31, 34, 12, 254, 106, 96, 171, 14, 155, 247, 214, 69, 24, 98, 3, 204,
|
||||
202, 194, 207, 30, 253, 44, 119, 70, 14, 96, 82, 250, 63, 6, 232, 38,
|
||||
89, 144, 102, 191, 82, 254, 20, 222, 96, 162, 110, 6, 159, 58, 200, 226,
|
||||
98, 128, 42, 70, 84, 247, 128, 211, 136, 54, 143, 166, 60, 118, 99, 218,
|
||||
27, 193, 85, 81, 219, 223, 46, 41, 23, 233, 152, 222, 36, 236, 54, 181,
|
||||
56, 50, 4, 207, 129, 92, 78, 88, 197, 251, 131, 105, 31, 172, 38, 131,
|
||||
19, 204, 129, 47, 227, 106, 202, 183, 23, 6, 77, 224, 102, 147, 11, 218,
|
||||
131, 132, 60, 192, 208, 223, 236, 23, 103, 115, 89, 18, 185, 171, 70, 174,
|
||||
139, 0, 100, 160, 221, 11, 228, 60, 12, 122, 114, 12, 157, 235, 148, 57,
|
||||
83, 62, 173, 131, 169, 126, 85, 99, 93, 243, 81, 80, 29, 245, 206, 82,
|
||||
236, 227, 166, 14, 230, 213, 144, 97, 27, 111, 99, 164, 105, 150, 89, 111,
|
||||
252, 118, 140, 232, 120, 183, 137, 213, 232, 157, 224, 33, 134, 118, 186, 80,
|
||||
159, 2, 186, 193, 54, 242, 25, 237, 232, 249, 226, 213, 90, 149, 90, 160,
|
||||
118, 69, 64, 37, 10, 183, 109, 246, 30, 52, 219, 69, 189, 26, 116, 220,
|
||||
50, 244, 243, 243, 139, 137, 232, 98, 38, 45, 256, 143, 171, 101, 73, 238,
|
||||
123, 45, 194, 167, 250, 123, 12, 29, 136, 237, 141, 21, 89, 96, 199, 44,
|
||||
8, 214, 208, 17, 113, 41, 137, 26, 166, 155, 89, 85, 54, 58, 97, 160,
|
||||
50, 239, 58, 71, 21, 157, 139, 12, 37, 198, 182, 131, 149, 134, 16, 204,
|
||||
164, 181, 248, 166, 52, 216, 136, 201, 37, 255, 187, 240, 5, 101, 147, 231,
|
||||
14, 163, 253, 134, 146, 216, 8, 54, 224, 90, 220, 195, 75, 215, 186, 58,
|
||||
71, 204, 124, 105, 239, 53, 16, 85, 69, 163, 195, 223, 33, 38, 69, 88,
|
||||
88, 203, 99, 55, 176, 13, 156, 204, 236, 99, 194, 134, 75, 247, 126, 129,
|
||||
160, 124, 233, 206, 139, 144, 154, 45, 233, 51, 206, 61, 60, 55, 205, 107,
|
||||
84, 108, 96, 188, 203, 31, 89, 20, 115, 144, 137, 90, 237, 78, 231, 185,
|
||||
120, 217, 1, 176, 169, 30, 155, 176, 100, 113, 53, 42, 193, 108, 14, 121,
|
||||
176, 158, 137, 92, 178, 44, 110, 249, 108, 234, 94, 101, 128, 12, 250, 173,
|
||||
72, 202, 232, 66, 139, 152, 189, 18, 32, 197, 9, 238, 246, 55, 119, 183,
|
||||
196, 119, 113, 247, 191, 100, 200, 245, 46, 16, 234, 112, 136, 116, 232, 48,
|
||||
176, 108, 11, 237, 14, 153, 93, 177, 124, 72, 67, 121, 135, 143, 45, 18,
|
||||
97, 251, 184, 172, 136, 55, 213, 8, 103, 12, 221, 212, 13, 160, 116, 91,
|
||||
237, 127, 218, 190, 103, 131, 77, 82, 36, 100, 22, 252, 79, 69, 54, 26,
|
||||
65, 182, 115, 142, 247, 20, 89, 81, 188, 244, 27, 120, 240, 248, 13, 230,
|
||||
67, 133, 32, 201, 129, 87, 9, 245, 66, 88, 166, 34, 46, 184, 119, 218,
|
||||
144, 235, 163, 40, 138, 134, 127, 217, 64, 227, 116, 67, 55, 202, 130, 48,
|
||||
199, 42, 251, 112, 124, 153, 123, 194, 243, 49, 250, 12, 78, 157, 167, 134,
|
||||
210, 73, 156, 102, 21, 88, 216, 123, 45, 11, 208, 18, 47, 187, 20, 43,
|
||||
3, 180, 124, 2, 136, 176, 77, 111, 138, 139, 91, 225, 126, 8, 74, 255,
|
||||
88, 192, 193, 239, 138, 204, 139, 194, 166, 130, 252, 184, 140, 168, 30, 177,
|
||||
121, 98, 131, 124, 69, 171, 75, 49, 184, 34, 76, 122, 202, 115, 184, 253,
|
||||
120, 182, 33, 251, 1, 74, 216, 217, 243, 168, 70, 162, 119, 158, 197, 198,
|
||||
61, 89, 7, 5, 54, 199, 211, 170, 23, 226, 44, 247, 165, 195, 7, 225,
|
||||
91, 23, 50, 15, 51, 208, 106, 94, 12, 31, 43, 112, 146, 139, 246, 182,
|
||||
113, 1, 97, 15, 66, 2, 51, 76, 164, 184, 237, 200, 218, 176, 72, 98,
|
||||
33, 135, 38, 147, 140, 229, 50, 94, 81, 187, 129, 17, 238, 168, 146, 203,
|
||||
181, 99, 164, 3, 104, 98, 255, 189, 114, 142, 86, 102, 229, 102, 80, 129,
|
||||
64, 84, 79, 161, 81, 156, 128, 111, 164, 197, 18, 15, 55, 196, 198, 191,
|
||||
28, 113, 117, 96, 207, 253, 19, 158, 231, 13, 53, 130, 252, 211, 58, 180,
|
||||
212, 142, 7, 219, 38, 81, 62, 109, 167, 113, 33, 56, 97, 185, 157, 130,
|
||||
186, 129, 119, 182, 196, 26, 54, 110, 65, 170, 166, 236, 30, 22, 162, 0,
|
||||
106, 12, 248, 33, 48, 72, 159, 17, 76, 244, 172, 132, 89, 171, 196, 76,
|
||||
254, 166, 76, 218, 226, 3, 52, 220, 238, 181, 179, 144, 225, 23, 3, 166,
|
||||
158, 35, 228, 154, 204, 23, 203, 71, 134, 189, 18, 168, 236, 141, 117, 138,
|
||||
2, 132, 78, 57, 154, 21, 250, 196, 184, 40, 161, 40, 10, 178, 134, 120,
|
||||
132, 123, 101, 82, 205, 121, 55, 140, 231, 56, 231, 71, 206, 246, 198, 150,
|
||||
146, 192, 45, 105, 242, 1, 125, 18, 176, 46, 222, 122, 19, 80, 113, 133,
|
||||
131, 162, 81, 51, 98, 168, 247, 161, 139, 39, 63, 162, 22, 153, 170, 92,
|
||||
91, 130, 174, 200, 45, 112, 99, 164, 132, 184, 191, 186, 200, 167, 86, 145,
|
||||
167, 227, 130, 44, 12, 158, 172, 249, 204, 17, 54, 249, 16, 200, 21, 174,
|
||||
67, 223, 105, 201, 50, 36, 133, 203, 244, 131, 228, 67, 29, 195, 91, 91,
|
||||
55, 107, 167, 154, 170, 137, 218, 183, 169, 61, 99, 175, 128, 23, 142, 183,
|
||||
66, 255, 59, 187, 66, 85, 212, 109, 168, 82, 16, 43, 67, 139, 114, 176,
|
||||
216, 255, 130, 94, 152, 79, 183, 64, 100, 23, 214, 82, 34, 230, 48, 15,
|
||||
242, 130, 50, 241, 81, 32, 5, 125, 183, 182, 184, 99, 248, 109, 159, 210,
|
||||
226, 61, 119, 129, 39, 149, 78, 214, 107, 78, 147, 124, 228, 18, 143, 188,
|
||||
84, 180, 233, 119, 64, 39, 158, 133, 177, 168, 6, 150, 80, 117, 150, 56,
|
||||
49, 72, 49, 37, 30, 242, 49, 142, 33, 156, 34, 44, 44, 72, 58, 22,
|
||||
249, 46, 168, 80, 25, 196, 64, 174, 97, 179, 244, 134, 213, 105, 63, 151,
|
||||
21, 90, 168, 90, 245, 28, 157, 65, 250, 232, 188, 27, 99, 160, 156, 127,
|
||||
68, 193, 10, 80, 205, 36, 138, 229, 12, 223, 70, 169, 251, 41, 48, 94,
|
||||
41, 177, 99, 256, 158, 0, 6, 83, 231, 191, 120, 135, 157, 146, 218, 213,
|
||||
160, 7, 47, 234, 98, 211, 79, 225, 179, 95, 175, 105, 185, 79, 115, 0,
|
||||
104, 14, 65, 124, 15, 188, 52, 9, 253, 27, 132, 137, 13, 127, 75, 238,
|
||||
185, 253, 33, 8, 52, 157, 164, 68, 232, 188, 69, 28, 209, 233, 5, 129,
|
||||
216, 90, 252, 212, 33, 200, 222, 9, 112, 15, 43, 36, 226, 114, 15, 249,
|
||||
217, 8, 148, 22, 147, 23, 143, 67, 222, 116, 235, 250, 212, 210, 39, 142,
|
||||
108, 64, 209, 83, 73, 66, 99, 34, 17, 29, 45, 151, 244, 114, 28, 241,
|
||||
144, 208, 146, 179, 132, 89, 217, 198, 252, 219, 205, 165, 75, 107, 11, 173,
|
||||
76, 6, 196, 247, 152, 216, 248, 91, 209, 178, 57, 250, 174, 60, 79, 123,
|
||||
18, 135, 9, 241, 230, 159, 184, 68, 156, 251, 215, 9, 113, 234, 75, 235,
|
||||
103, 194, 205, 129, 230, 45, 96, 73, 157, 20, 200, 212, 212, 228, 161, 7,
|
||||
231, 228, 108, 43, 198, 87, 140, 140, 4, 182, 164, 3, 53, 104, 250, 213,
|
||||
85, 38, 89, 61, 52, 187, 35, 204, 86, 249, 100, 71, 248, 213, 163, 215,
|
||||
66, 106, 252, 129, 40, 111, 47, 24, 186, 221, 85, 205, 199, 237, 122, 181,
|
||||
32, 46, 182, 135, 33, 251, 142, 34, 208, 242, 128, 255, 4, 234, 15, 33,
|
||||
167, 222, 32, 186, 191, 34, 255, 244, 98, 240, 228, 204, 30, 142, 32, 70,
|
||||
69, 83, 110, 151, 10, 243, 141, 21, 223, 69, 61, 37, 59, 209, 102, 114,
|
||||
223, 33, 129, 254, 255, 103, 86, 247, 235, 72, 126, 177, 102, 226, 102, 30,
|
||||
149, 221, 62, 247, 251, 120, 163, 173, 57, 202, 204, 24, 39, 106, 120, 143,
|
||||
202, 176, 191, 147, 37, 38, 51, 133, 47, 245, 157, 132, 154, 71, 183, 111,
|
||||
30, 180, 18, 202, 82, 96, 170, 91, 157, 181, 212, 140, 256, 8, 196, 121,
|
||||
149, 79, 66, 127, 113, 78, 4, 197, 84, 256, 111, 222, 102, 63, 228, 104,
|
||||
136, 223, 67, 193, 93, 154, 249, 83, 204, 101, 200, 234, 84, 252, 230, 195,
|
||||
43, 140, 120, 242, 89, 63, 166, 233, 209, 94, 43, 170, 126, 5, 205, 78,
|
||||
112, 80, 143, 151, 146, 248, 137, 203, 45, 183, 61, 1, 155, 8, 102, 59,
|
||||
68, 212, 230, 61, 254, 191, 128, 223, 176, 123, 229, 27, 146, 120, 96, 165,
|
||||
213, 12, 232, 40, 186, 225, 66, 105, 200, 195, 212, 110, 237, 238, 151, 19,
|
||||
12, 171, 150, 82, 7, 228, 79, 52, 15, 78, 62, 43, 21, 154, 114, 21,
|
||||
12, 212, 256, 232, 125, 127, 5, 51, 37, 252, 136, 13, 47, 195, 168, 191,
|
||||
231, 55, 57, 251, 214, 116, 15, 86, 210, 41, 249, 242, 119, 27, 250, 203,
|
||||
107, 69, 90, 43, 206, 154, 127, 54, 100, 78, 187, 54, 244, 177, 234, 167,
|
||||
202, 136, 209, 171, 69, 114, 133, 173, 26, 139, 78, 141, 128, 32, 124, 39,
|
||||
45, 218, 96, 68, 90, 44, 67, 62, 83, 190, 188, 256, 103, 42, 102, 64,
|
||||
249, 0, 141, 11, 61, 69, 70, 66, 233, 237, 29, 200, 251, 157, 71, 51,
|
||||
64, 133, 113, 76, 35, 125, 76, 137, 217, 145, 35, 69, 226, 180, 56, 249,
|
||||
156, 163, 176, 237, 81, 54, 85, 169, 115, 211, 129, 70, 248, 40, 252, 192,
|
||||
194, 101, 247, 8, 181, 124, 217, 191, 194, 93, 99, 127, 117, 177, 144, 151,
|
||||
228, 121, 32, 11, 89, 81, 26, 29, 183, 76, 249, 132, 179, 70, 34, 102,
|
||||
20, 66, 87, 63, 124, 205, 174, 177, 87, 219, 73, 218, 91, 87, 176, 72,
|
||||
15, 211, 47, 61, 251, 165, 39, 247, 146, 70, 150, 57, 1, 212, 36, 162,
|
||||
39, 38, 16, 216, 3, 50, 116, 200, 32, 234, 77, 181, 155, 19, 90, 188,
|
||||
36, 6, 254, 46, 46, 203, 25, 230, 181, 196, 4, 151, 225, 65, 122, 216,
|
||||
168, 86, 158, 131, 136, 16, 49, 102, 233, 64, 154, 88, 228, 52, 146, 69,
|
||||
93, 157, 243, 121, 70, 209, 126, 213, 88, 145, 236, 65, 70, 96, 204, 47,
|
||||
10, 200, 77, 8, 103, 150, 48, 153, 5, 37, 52, 235, 209, 31, 181, 126,
|
||||
83, 142, 224, 140, 6, 32, 200, 171, 160, 179, 115, 229, 75, 194, 208, 39,
|
||||
59, 223, 52, 247, 38, 197, 135, 1, 6, 189, 106, 114, 168, 5, 211, 222,
|
||||
44, 63, 90, 160, 116, 172, 170, 133, 125, 138, 39, 131, 23, 178, 10, 214,
|
||||
36, 93, 28, 59, 68, 17, 123, 25, 255, 184, 204, 102, 194, 214, 129, 94,
|
||||
159, 245, 112, 141, 62, 11, 61, 197, 124, 221, 205, 11, 79, 71, 201, 54,
|
||||
58, 150, 29, 121, 87, 46, 240, 201, 68, 20, 194, 209, 47, 152, 158, 174,
|
||||
193, 164, 120, 255, 216, 165, 247, 58, 85, 130, 220, 23, 122, 223, 188, 98,
|
||||
21, 70, 72, 170, 150, 237, 76, 143, 112, 238, 206, 146, 215, 110, 4, 250,
|
||||
68, 44, 174, 177, 30, 98, 143, 241, 180, 127, 113, 48, 0, 1, 179, 199,
|
||||
59, 106, 201, 114, 29, 86, 173, 133, 217, 44, 200, 141, 107, 172, 16, 60,
|
||||
82, 58, 239, 94, 141, 234, 186, 235, 109, 173, 249, 139, 141, 59, 100, 248,
|
||||
84, 144, 49, 160, 51, 207, 164, 103, 74, 97, 146, 202, 193, 125, 168, 134,
|
||||
236, 111, 135, 121, 59, 145, 168, 200, 181, 173, 109, 2, 255, 6, 9, 245,
|
||||
90, 202, 214, 143, 121, 65, 85, 232, 132, 77, 228, 84, 26, 54, 184, 15,
|
||||
161, 29, 177, 79, 43, 0, 156, 184, 163, 165, 62, 90, 179, 93, 45, 239,
|
||||
1, 16, 120, 189, 127, 47, 74, 166, 20, 214, 233, 226, 89, 217, 229, 26,
|
||||
156, 53, 162, 60, 21, 3, 192, 72, 111, 51, 53, 101, 181, 208, 88, 82,
|
||||
179, 160, 219, 113, 240, 108, 43, 224, 162, 147, 62, 14, 95, 81, 205, 4,
|
||||
160, 177, 225, 115, 29, 69, 235, 168, 148, 29, 128, 114, 124, 129, 172, 165,
|
||||
215, 231, 214, 86, 160, 44, 157, 91, 248, 183, 73, 164, 56, 181, 162, 92,
|
||||
141, 118, 127, 240, 196, 77, 0, 9, 244, 79, 250, 100, 195, 25, 255, 85,
|
||||
94, 35, 212, 137, 107, 34, 110, 20, 200, 104, 17, 32, 231, 43, 150, 159,
|
||||
231, 216, 223, 190, 226, 109, 162, 197, 87, 92, 224, 11, 111, 73, 60, 225,
|
||||
238, 73, 246, 169, 19, 217, 119, 38, 121, 118, 70, 82, 99, 241, 110, 67,
|
||||
31, 76, 146, 215, 124, 240, 31, 103, 139, 224, 75, 160, 31, 78, 93, 4,
|
||||
64, 9, 103, 223, 6, 227, 119, 85, 116, 81, 21, 43, 46, 206, 234, 132,
|
||||
85, 99, 22, 131, 135, 97, 86, 13, 234, 188, 21, 14, 89, 169, 207, 238,
|
||||
219, 177, 190, 72, 157, 41, 114, 140, 92, 141, 186, 1, 63, 107, 225, 184,
|
||||
118, 150, 153, 254, 241, 106, 120, 210, 104, 144, 151, 161, 88, 206, 125, 164,
|
||||
15, 211, 173, 49, 146, 241, 71, 36, 58, 201, 46, 27, 33, 187, 91, 162,
|
||||
117, 19, 210, 213, 187, 97, 193, 50, 190, 114, 217, 60, 61, 167, 207, 213,
|
||||
213, 53, 135, 34, 156, 91, 115, 119, 46, 99, 242, 1, 90, 52, 198, 227,
|
||||
201, 91, 216, 146, 210, 82, 121, 38, 73, 133, 182, 193, 132, 148, 246, 75,
|
||||
109, 157, 179, 113, 176, 134, 205, 159, 148, 58, 103, 171, 132, 156, 133, 147,
|
||||
161, 231, 39, 100, 175, 97, 125, 28, 183, 129, 135, 191, 202, 181, 29, 218,
|
||||
43, 104, 148, 203, 189, 204, 4, 182, 169, 1, 134, 122, 141, 202, 13, 187,
|
||||
177, 112, 162, 35, 231, 6, 8, 241, 99, 6, 191, 45, 113, 113, 101, 104};
|
||||
|
||||
// The S-Box we use for further linearity breaking.
|
||||
// We created it by taking the digits of decimal expansion of e.
|
||||
// The code that created it can be found in 'ProduceRandomSBox.c'.
|
||||
unsigned char SBox[256] = {
|
||||
//0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0x7d, 0xd1, 0x70, 0x0b, 0xfa, 0x39, 0x18, 0xc3, 0xf3, 0xbb, 0xa7, 0xd4, 0x84, 0x25, 0x3b, 0x3c, // 0
|
||||
0x2c, 0x15, 0x69, 0x9a, 0xf9, 0x27, 0xfb, 0x02, 0x52, 0xba, 0xa8, 0x4b, 0x20, 0xb5, 0x8b, 0x3a, // 1
|
||||
0x88, 0x8e, 0x26, 0xcb, 0x71, 0x5e, 0xaf, 0xad, 0x0c, 0xac, 0xa1, 0x93, 0xc6, 0x78, 0xce, 0xfc, // 2
|
||||
0x2a, 0x76, 0x17, 0x1f, 0x62, 0xc2, 0x2e, 0x99, 0x11, 0x37, 0x65, 0x40, 0xfd, 0xa0, 0x03, 0xc1, // 3
|
||||
0xca, 0x48, 0xe2, 0x9b, 0x81, 0xe4, 0x1c, 0x01, 0xec, 0x68, 0x7a, 0x5a, 0x50, 0xf8, 0x0e, 0xa3, // 4
|
||||
0xe8, 0x61, 0x2b, 0xa2, 0xeb, 0xcf, 0x8c, 0x3d, 0xb4, 0x95, 0x13, 0x08, 0x46, 0xab, 0x91, 0x7b, // 5
|
||||
0xea, 0x55, 0x67, 0x9d, 0xdd, 0x29, 0x6a, 0x8f, 0x9f, 0x22, 0x4e, 0xf2, 0x57, 0xd2, 0xa9, 0xbd, // 6
|
||||
0x38, 0x16, 0x5f, 0x4c, 0xf7, 0x9e, 0x1b, 0x2f, 0x30, 0xc7, 0x41, 0x24, 0x5c, 0xbf, 0x05, 0xf6, // 7
|
||||
0x0a, 0x31, 0xa5, 0x45, 0x21, 0x33, 0x6b, 0x6d, 0x6c, 0x86, 0xe1, 0xa4, 0xe6, 0x92, 0x9c, 0xdf, // 8
|
||||
0xe7, 0xbe, 0x28, 0xe3, 0xfe, 0x06, 0x4d, 0x98, 0x80, 0x04, 0x96, 0x36, 0x3e, 0x14, 0x4a, 0x34, // 9
|
||||
0xd3, 0xd5, 0xdb, 0x44, 0xcd, 0xf5, 0x54, 0xdc, 0x89, 0x09, 0x90, 0x42, 0x87, 0xff, 0x7e, 0x56, // A
|
||||
0x5d, 0x59, 0xd7, 0x23, 0x75, 0x19, 0x97, 0x73, 0x83, 0x64, 0x53, 0xa6, 0x1e, 0xd8, 0xb0, 0x49, // B
|
||||
0x3f, 0xef, 0xbc, 0x7f, 0x43, 0xf0, 0xc9, 0x72, 0x0f, 0x63, 0x79, 0x2d, 0xc0, 0xda, 0x66, 0xc8, // C
|
||||
0x32, 0xde, 0x47, 0x07, 0xb8, 0xe9, 0x1d, 0xc4, 0x85, 0x74, 0x82, 0xcc, 0x60, 0x51, 0x77, 0x0d, // D
|
||||
0xaa, 0x35, 0xed, 0x58, 0x7c, 0x5b, 0xb9, 0x94, 0x6e, 0x8d, 0xb1, 0xc5, 0xb7, 0xee, 0xb6, 0xae, // E
|
||||
0x10, 0xe0, 0xd6, 0xd9, 0xe5, 0x4f, 0xf1, 0x12, 0x00, 0xd0, 0xf4, 0x1a, 0x6f, 0x8a, 0xb3, 0xb2 }; // F
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Helper functions definition portion.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Don't vectorize, move decl to header file
|
||||
|
||||
// Translates an input array with values in base 257 to output array with values in base 256.
|
||||
// Returns the carry bit.
|
||||
//
|
||||
// Parameters:
|
||||
// - input: the input array of size EIGHTH_N. Each value in the array is a number in Z_257.
|
||||
// The MSB is assumed to be the last one in the array.
|
||||
// - output: the input array encoded in base 256.
|
||||
//
|
||||
// Returns:
|
||||
// - The carry bit (MSB).
|
||||
swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N]);
|
||||
|
||||
// Translates an input integer into the range (-FIELD_SIZE / 2) <= result <= (FIELD_SIZE / 2).
|
||||
//
|
||||
// Parameters:
|
||||
// - x: the input integer.
|
||||
//
|
||||
// Returns:
|
||||
// - The result, which equals (x MOD FIELD_SIZE), such that |result| <= (FIELD_SIZE / 2).
|
||||
int Center(int x);
|
||||
|
||||
// Calculates bit reversal permutation.
|
||||
//
|
||||
// Parameters:
|
||||
// - input: the input to reverse.
|
||||
// - numOfBits: the number of bits in the input to reverse.
|
||||
//
|
||||
// Returns:
|
||||
// - The resulting number, which is obtained from the input by reversing its bits.
|
||||
int ReverseBits(int input, int numOfBits);
|
||||
|
||||
// Initializes the FFT fast lookup table.
|
||||
// Shall be called only once.
|
||||
void InitializeSWIFFTX();
|
||||
|
||||
// Calculates the FFT.
|
||||
//
|
||||
// Parameters:
|
||||
// - input: the input to the FFT.
|
||||
// - output: the resulting output.
|
||||
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper functions implementation portion.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Don't vectorize, delete this copy.
|
||||
|
||||
swift_int16_t TranslateToBase256(swift_int32_t input[EIGHTH_N], unsigned char output[EIGHTH_N])
|
||||
{
|
||||
swift_int32_t pairs[EIGHTH_N / 2];
|
||||
int i;
|
||||
|
||||
for (i = 0; i < EIGHTH_N; i += 2)
|
||||
{
|
||||
// input[i] + 257 * input[i + 1]
|
||||
pairs[i >> 1] = input[i] + input[i + 1] + (input[i + 1] << 8);
|
||||
}
|
||||
|
||||
for (i = (EIGHTH_N / 2) - 1; i > 0; --i)
|
||||
{
|
||||
int j;
|
||||
|
||||
for (j = i - 1; j < (EIGHTH_N / 2) - 1; ++j)
|
||||
{
|
||||
// pairs[j + 1] * 513, because 257^2 = 513 % 256^2.
|
||||
register swift_int32_t temp = pairs[j] + pairs[j + 1] + (pairs[j + 1] << 9);
|
||||
pairs[j] = temp & 0xffff;
|
||||
pairs[j + 1] += (temp >> 16);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < EIGHTH_N; i += 2)
|
||||
{
|
||||
output[i] = (unsigned char) (pairs[i >> 1] & 0xff);
|
||||
output[i + 1] = (unsigned char) ((pairs[i >> 1] >> 8) & 0xff);
|
||||
}
|
||||
|
||||
return (pairs[EIGHTH_N/2 - 1] >> 16);
|
||||
}
|
||||
|
||||
int Center(int x)
|
||||
{
|
||||
int result = x % FIELD_SIZE;
|
||||
|
||||
if (result > (FIELD_SIZE / 2))
|
||||
result -= FIELD_SIZE;
|
||||
|
||||
if (result < (FIELD_SIZE / -2))
|
||||
result += FIELD_SIZE;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int ReverseBits(int input, int numOfBits)
|
||||
{
|
||||
register int reversed = 0;
|
||||
|
||||
for (input |= numOfBits; input > 1; input >>= 1)
|
||||
reversed = (reversed << 1) | (input & 1);
|
||||
|
||||
return reversed;
|
||||
}
|
||||
|
||||
void InitializeSWIFFTX()
|
||||
{
|
||||
int i, j, k, x;
|
||||
// The powers of OMEGA
|
||||
int omegaPowers[2 * N];
|
||||
omegaPowers[0] = 1;
|
||||
|
||||
if (wasSetupDone)
|
||||
return;
|
||||
|
||||
for (i = 1; i < (2 * N); ++i)
|
||||
{
|
||||
omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA);
|
||||
}
|
||||
|
||||
for (i = 0; i < (N / W); ++i)
|
||||
{
|
||||
for (j = 0; j < W; ++j)
|
||||
{
|
||||
multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)];
|
||||
}
|
||||
}
|
||||
|
||||
for (x = 0; x < 256; ++x)
|
||||
{
|
||||
for (j = 0; j < 8; ++j)
|
||||
{
|
||||
register int temp = 0;
|
||||
for (k = 0; k < 8; ++k)
|
||||
{
|
||||
temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)]
|
||||
* ((x >> k) & 1);
|
||||
}
|
||||
|
||||
fftTable[(x << 3) + j] = Center(temp);
|
||||
}
|
||||
}
|
||||
|
||||
wasSetupDone = true;
|
||||
}
|
||||
|
||||
// input should be deinterleaved in contiguos memory
|
||||
// output and F are 4x32
|
||||
// multipliers & fftTable are scalar 16
|
||||
|
||||
|
||||
void FFT_4way(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
{
|
||||
swift_int16_t *mult = multipliers;
|
||||
m128_swift_int32_t F[64];
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
int j = i<<3;
|
||||
|
||||
// Need to isolate bytes in input, 8 bytes per lane.
|
||||
// Each iteration of the loop process one input vector
|
||||
// Each lane reads a different index to ffttable.
|
||||
|
||||
// deinterleave the input!
|
||||
|
||||
// load table with 4 lanes from different indexes into fftTable
|
||||
// extract bytes into m128 4x16
|
||||
// mutiply by vectorized mult
|
||||
|
||||
// input[lane][byte]
|
||||
|
||||
__m128i table;
|
||||
table = _mm_set_epi32( fftTable[ input[3][i] ],
|
||||
fftTable[ input[2][i] ],
|
||||
fftTable[ input[1][i] ],
|
||||
fftTable[ input[0][i] ] );
|
||||
|
||||
F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table );
|
||||
|
||||
table = _mm_set_epi32( fftTable[ input[3][i+1] ]
|
||||
fftTable[ input[2][i+1] ]
|
||||
fftTable[ input[1][i+1] ]
|
||||
fftTable[ input[0][i+1] ] );
|
||||
|
||||
F[i+8] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ), table );
|
||||
|
||||
|
||||
m128_swift_int16_t *table = &( fftTable[input[i] << 3] );
|
||||
|
||||
F[i ] = _mm_mullo_epi32( mm128_const1_32( mult[j+0] ),
|
||||
mm128_const1_32( table[0] ) );
|
||||
F[i+ 8] = _mm_mullo_epi32( mm128_const1_32( mult[j+1] ),
|
||||
mm128_const1_32( table[1] ) );
|
||||
F[i+16] = _mm_mullo_epi32( mm128_const1_32( mult[j+2] ),
|
||||
mm128_const1_32( table[2] ) );
|
||||
F[i+24] = _mm_mullo_epi32( mm128_const1_32( mult[j+3] ),
|
||||
mm128_const1_32( table[3] ) );
|
||||
F[i+32] = _mm_mullo_epi32( mm128_const1_32( mult[j+4] ),
|
||||
mm128_const1_32( table[4] ) );
|
||||
F[i+40] = _mm_mullo_epi32( mm128_const1_32( mult[j+5] ),
|
||||
mm128_const1_32( table[5] ) );
|
||||
F[i+48] = _mm_mullo_epi32( mm128_const1_32( mult[j+6] ),
|
||||
mm128_const1_32( table[6] ) );
|
||||
F[i+56] = _mm_mullo_epi32( mm128_const1_32( mult[j+7] ),
|
||||
mm128_const1_32( table[7] ) );
|
||||
}
|
||||
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
int j = i<<3;
|
||||
ADD_SUB_4WAY( F[j ], F[j+1] );
|
||||
ADD_SUB_4WAY( F[j+2], F[j+3] );
|
||||
ADD_SUB_4WAY( F[j+4], F[j+5] );
|
||||
ADD_SUB_4WAY( F[j+6], F[j+7] );
|
||||
|
||||
F[j+3] = _mm_slli_epi32( F[j+3], 4 );
|
||||
F[j+7] = _mm_slli_epi32( F[j+7], 4 );
|
||||
|
||||
ADD_SUB_4WAY( F[j ], F[j+2] );
|
||||
ADD_SUB_4WAY( F[j+1], F[j+3] );
|
||||
ADD_SUB_4WAY( F[j+4], F[j+6] );
|
||||
ADD_SUB_4WAY( F[j+5], F[j+7] );
|
||||
|
||||
F[j+5] = _mm_slli_epi32( F[j+5], 2 );
|
||||
F[j+6] = _mm_slli_epi32( F[j+6], 4 );
|
||||
F[j+7] = _mm_slli_epi32( F[j+7], 6 );
|
||||
|
||||
ADD_SUB_4WAY( F[j ], F[j+4] );
|
||||
ADD_SUB_4WAY( F[j+1], F[j+5] );
|
||||
ADD_SUB_4WAY( F[j+2], F[j+6] );
|
||||
ADD_SUB_4WAY( F[j+3], F[j+7] );
|
||||
|
||||
output[i ] = Q_REDUCE_4WAY( F[j ] );
|
||||
output[i+ 8] = Q_REDUCE_4WAY( F[j+1] );
|
||||
output[i+16] = Q_REDUCE_4WAY( F[j+2] );
|
||||
output[i+24] = Q_REDUCE_4WAY( F[j+3] );
|
||||
output[i+32] = Q_REDUCE_4WAY( F[j+4] );
|
||||
output[i+40] = Q_REDUCE_4WAY( F[j+5] );
|
||||
output[i+48] = Q_REDUCE_4WAY( F[j+6] );
|
||||
output[i+56] = Q_REDUCE_4WAY( F[j+7] );
|
||||
}
|
||||
}
|
||||
|
||||
// Calculates the FFT part of SWIFFT.
|
||||
// We divided the SWIFFT calculation into two, because that way we could save 2 computations of
|
||||
// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs
|
||||
// is only the A's part.
|
||||
//
|
||||
// Parameters:
|
||||
// - input: the input to FFT.
|
||||
// - m: the input size divided by 8. The function performs m FFTs.
|
||||
// - output: will store the result.
|
||||
void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0;
|
||||
i < m;
|
||||
i++, input += EIGHTH_N, output += N)
|
||||
{
|
||||
FFT(input, output);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculates the 'sum' part of SWIFFT, including the base change at the end.
|
||||
// We divided the SWIFFT calculation into two, because that way we could save 2 computations of
|
||||
// the FFT part, since in the first stage of SWIFFTX the difference between the first 3 SWIFFTs
|
||||
// is only the A's part.
|
||||
//
|
||||
// Parameters:
|
||||
// - input: the input. Of size 64 * m.
|
||||
// - m: the input size divided by 64.
|
||||
// - output: will store the result.
|
||||
// - a: the coefficients in the sum. Of size 64 * m.
|
||||
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
|
||||
{
|
||||
int i, j;
|
||||
swift_int32_t result[N];
|
||||
register swift_int16_t carry = 0;
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
{
|
||||
register swift_int32_t sum = 0;
|
||||
const register swift_int32_t *f = input + j;
|
||||
const register swift_int16_t *k = a + j;
|
||||
|
||||
for (i = 0; i < m; i++, f += N,k += N)
|
||||
{
|
||||
sum += (*f) * (*k);
|
||||
}
|
||||
|
||||
result[j] = sum;
|
||||
}
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
{
|
||||
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
|
||||
}
|
||||
|
||||
for (j = 0; j < 8; ++j)
|
||||
{
|
||||
int register carryBit = TranslateToBase256(result + (j << 3), output + (j << 3));
|
||||
carry |= carryBit << j;
|
||||
}
|
||||
|
||||
output[N] = carry;
|
||||
}
|
||||
|
||||
|
||||
// On entry input is interleaved 4x64. SIZE is *4 lanes / 8 bytes,
|
||||
// multiply by 2.
|
||||
|
||||
|
||||
void ComputeSingleSWIFFTX_4way( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE],
|
||||
bool doSmooth)
|
||||
{
|
||||
int i;
|
||||
// Will store the result of the FFT parts:
|
||||
m128_swift_int32_t fftOut[N * M];
|
||||
// swift_int32_t fftOut[N * M];
|
||||
unsigned char intermediate[N * 3 + 8];
|
||||
unsigned char carry0,carry1,carry2;
|
||||
|
||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||
// overriden by the following SWIFFT):
|
||||
|
||||
// 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs:
|
||||
SWIFFTFFT(input, M, fftOut);
|
||||
|
||||
// 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients:
|
||||
|
||||
// 2a. The first SWIFFT:
|
||||
SWIFFTSum(fftOut, M, intermediate, As);
|
||||
// Remember the carry byte:
|
||||
carry0 = intermediate[N];
|
||||
|
||||
// 2b. The second one:
|
||||
SWIFFTSum(fftOut, M, intermediate + N, As + (M * N));
|
||||
carry1 = intermediate[2 * N];
|
||||
|
||||
// 2c. The third one:
|
||||
SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N));
|
||||
carry2 = intermediate[3 * N];
|
||||
|
||||
//2d. Put three carry bytes in their place
|
||||
intermediate[3 * N] = carry0;
|
||||
intermediate[(3 * N) + 1] = carry1;
|
||||
intermediate[(3 * N) + 2] = carry2;
|
||||
|
||||
// Padding intermediate output with 5 zeroes.
|
||||
memset(intermediate + (3 * N) + 3, 0, 5);
|
||||
|
||||
// Apply the S-Box:
|
||||
for (i = 0; i < (3 * N) + 8; ++i)
|
||||
{
|
||||
intermediate[i] = SBox[intermediate[i]];
|
||||
}
|
||||
|
||||
// 3. The final and last SWIFFT:
|
||||
SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut);
|
||||
SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As);
|
||||
|
||||
if (doSmooth)
|
||||
{
|
||||
unsigned char sum[N];
|
||||
register int i, j;
|
||||
memset(sum, 0, N);
|
||||
|
||||
for (i = 0; i < (N + 1) * 8; ++i)
|
||||
{
|
||||
register const swift_int16_t *AsRow;
|
||||
register int AShift;
|
||||
|
||||
if (!(output[i >> 3] & (1 << (i & 7))))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
AsRow = As + N * M + (i & ~(N - 1)) ;
|
||||
AShift = i & 63;
|
||||
|
||||
for (j = AShift; j < N; ++j)
|
||||
{
|
||||
sum[j] += AsRow[j - AShift];
|
||||
}
|
||||
|
||||
for(j = 0; j < AShift; ++j)
|
||||
{
|
||||
sum[j] -= AsRow[N - AShift + j];
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < N; ++i)
|
||||
{
|
||||
output[i] = sum[i];
|
||||
}
|
||||
|
||||
output[N] = 0;
|
||||
}
|
||||
}
|
@@ -18,6 +18,8 @@
|
||||
//#include "stdbool.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Constants and static tables portion.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -49,20 +51,20 @@
|
||||
// - A: the first operand. After the operation stores the sum of the two operands.
|
||||
// - B: the second operand. After the operation stores the difference between the first and the
|
||||
// second operands.
|
||||
#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
|
||||
//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
|
||||
|
||||
// Quickly reduces an integer modulo 257.
|
||||
//
|
||||
// Parameters:
|
||||
// - A: the input.
|
||||
#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
|
||||
//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
|
||||
|
||||
// Since we need to do the setup only once, this is the indicator variable:
|
||||
static bool wasSetupDone = false;
|
||||
|
||||
// This array stores the powers of omegas that correspond to the indices, which are the input
|
||||
// values. Known also as the "outer FFT twiddle factors".
|
||||
swift_int16_t multipliers[N];
|
||||
swift_int16_t multipliers[N] __attribute__ ((aligned (64)));
|
||||
|
||||
// This array stores the powers of omegas, multiplied by the corresponding values.
|
||||
// We store this table to save computation time.
|
||||
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
|
||||
// compression function, i is between 0 and 31, x_i is a 64-bit value.
|
||||
// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
|
||||
// formula (2), section 3, page 6.
|
||||
swift_int16_t fftTable[256 * EIGHTH_N];
|
||||
swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));
|
||||
|
||||
// The A's we use in SWIFFTX shall be random elements of Z_257.
|
||||
// We generated these A's from the decimal expansion of PI as follows: we converted each
|
||||
// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
|
||||
// element, otherwise move to the next triple of digits in the expansion. This guarntees that
|
||||
// the A's are random, provided that PI digits are.
|
||||
const swift_int16_t As[3 * M * N] =
|
||||
const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
|
||||
{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
|
||||
50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
|
||||
95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
|
||||
@@ -602,21 +604,14 @@ void InitializeSWIFFTX()
|
||||
int omegaPowers[2 * N];
|
||||
omegaPowers[0] = 1;
|
||||
|
||||
if (wasSetupDone)
|
||||
return;
|
||||
if (wasSetupDone) return;
|
||||
|
||||
for (i = 1; i < (2 * N); ++i)
|
||||
{
|
||||
omegaPowers[i] = Center(omegaPowers[i - 1] * OMEGA);
|
||||
}
|
||||
|
||||
for (i = 0; i < (N / W); ++i)
|
||||
{
|
||||
for (j = 0; j < W; ++j)
|
||||
{
|
||||
multipliers[(i << 3) + j] = omegaPowers[ReverseBits(i, N / W) * (2 * j + 1)];
|
||||
}
|
||||
}
|
||||
|
||||
for (x = 0; x < 256; ++x)
|
||||
{
|
||||
@@ -624,10 +619,8 @@ void InitializeSWIFFTX()
|
||||
{
|
||||
register int temp = 0;
|
||||
for (k = 0; k < 8; ++k)
|
||||
{
|
||||
temp += omegaPowers[(EIGHTH_N * (2 * j + 1) * ReverseBits(k, W)) % (2 * N)]
|
||||
* ((x >> k) & 1);
|
||||
}
|
||||
|
||||
fftTable[(x << 3) + j] = Center(temp);
|
||||
}
|
||||
@@ -636,9 +629,203 @@ void InitializeSWIFFTX()
|
||||
wasSetupDone = true;
|
||||
}
|
||||
|
||||
// In the original code the F matrix is rotated so it was not aranged
|
||||
// the same as all the other data. Rearanging F to match all the other
|
||||
// data made vectorizing possible, the compiler probably could have been
|
||||
// able to auto-vectorize with proper data organisation.
|
||||
// Also in the original code the custom 16 bit data types are all now 32
|
||||
// bit int32_t regardless of the type name.
|
||||
//
|
||||
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
{
|
||||
swift_int16_t *mult = multipliers;
|
||||
#if defined(__AVX2__)
|
||||
|
||||
__m256i F[8] __attribute__ ((aligned (64)));
|
||||
__m256i *mul = (__m256i*)multipliers;
|
||||
__m256i *out = (__m256i*)output;
|
||||
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
|
||||
|
||||
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
|
||||
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
|
||||
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
|
||||
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
|
||||
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
|
||||
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
|
||||
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
|
||||
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
__m256i tmp = b; \
|
||||
b = _mm256_sub_epi32( a, b ); \
|
||||
a = _mm256_add_epi32( a, tmp ); \
|
||||
}
|
||||
|
||||
ADD_SUB( F[0], F[1] );
|
||||
ADD_SUB( F[2], F[3] );
|
||||
ADD_SUB( F[4], F[5] );
|
||||
ADD_SUB( F[6], F[7] );
|
||||
|
||||
F[3] = _mm256_slli_epi32( F[3], 4 );
|
||||
F[7] = _mm256_slli_epi32( F[7], 4 );
|
||||
|
||||
ADD_SUB( F[0], F[2] );
|
||||
ADD_SUB( F[1], F[3] );
|
||||
ADD_SUB( F[4], F[6] );
|
||||
ADD_SUB( F[5], F[7] );
|
||||
|
||||
F[5] = _mm256_slli_epi32( F[5], 2 );
|
||||
F[6] = _mm256_slli_epi32( F[6], 4 );
|
||||
F[7] = _mm256_slli_epi32( F[7], 6 );
|
||||
|
||||
ADD_SUB( F[0], F[4] );
|
||||
ADD_SUB( F[1], F[5] );
|
||||
ADD_SUB( F[2], F[6] );
|
||||
ADD_SUB( F[3], F[7] );
|
||||
|
||||
#undef ADD_SUB
|
||||
|
||||
#if defined (__AVX512VL__) && defined(__AVX512BW__)
|
||||
|
||||
const __m256i mask = _mm256_movm_epi8( 0x11111111 );
|
||||
|
||||
#else
|
||||
|
||||
const __m256i mask = m256_const1_32( 0x000000ff );
|
||||
|
||||
#endif
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
_mm256_sub_epi32( _mm256_and_si256( a, mask ), \
|
||||
_mm256_srai_epi32( a, 8 ) )
|
||||
|
||||
out[0] = Q_REDUCE( F[0] );
|
||||
out[1] = Q_REDUCE( F[1] );
|
||||
out[2] = Q_REDUCE( F[2] );
|
||||
out[3] = Q_REDUCE( F[3] );
|
||||
out[4] = Q_REDUCE( F[4] );
|
||||
out[5] = Q_REDUCE( F[5] );
|
||||
out[6] = Q_REDUCE( F[6] );
|
||||
out[7] = Q_REDUCE( F[7] );
|
||||
|
||||
#undef Q_REDUCE
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
__m128i F[16] __attribute__ ((aligned (64)));
|
||||
__m128i *mul = (__m128i*)multipliers;
|
||||
__m128i *out = (__m128i*)output;
|
||||
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
|
||||
|
||||
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
|
||||
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
|
||||
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
|
||||
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
|
||||
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
|
||||
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
|
||||
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
|
||||
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
|
||||
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
|
||||
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
|
||||
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
|
||||
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
|
||||
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
|
||||
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
|
||||
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
|
||||
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
__m128i tmp = b; \
|
||||
b = _mm_sub_epi32( a, b ); \
|
||||
a = _mm_add_epi32( a, tmp ); \
|
||||
}
|
||||
|
||||
ADD_SUB( F[ 0], F[ 2] );
|
||||
ADD_SUB( F[ 1], F[ 3] );
|
||||
ADD_SUB( F[ 4], F[ 6] );
|
||||
ADD_SUB( F[ 5], F[ 7] );
|
||||
ADD_SUB( F[ 8], F[10] );
|
||||
ADD_SUB( F[ 9], F[11] );
|
||||
ADD_SUB( F[12], F[14] );
|
||||
ADD_SUB( F[13], F[15] );
|
||||
|
||||
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
|
||||
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
|
||||
F[14] = _mm_slli_epi32( F[14], 4 );
|
||||
F[15] = _mm_slli_epi32( F[15], 4 );
|
||||
|
||||
ADD_SUB( F[ 0], F[ 4] );
|
||||
ADD_SUB( F[ 1], F[ 5] );
|
||||
ADD_SUB( F[ 2], F[ 6] );
|
||||
ADD_SUB( F[ 3], F[ 7] );
|
||||
ADD_SUB( F[ 8], F[12] );
|
||||
ADD_SUB( F[ 9], F[13] );
|
||||
ADD_SUB( F[10], F[14] );
|
||||
ADD_SUB( F[11], F[15] );
|
||||
|
||||
F[10] = _mm_slli_epi32( F[10], 2 );
|
||||
F[11] = _mm_slli_epi32( F[11], 2 );
|
||||
F[12] = _mm_slli_epi32( F[12], 4 );
|
||||
F[13] = _mm_slli_epi32( F[13], 4 );
|
||||
F[14] = _mm_slli_epi32( F[14], 6 );
|
||||
F[15] = _mm_slli_epi32( F[15], 6 );
|
||||
|
||||
ADD_SUB( F[ 0], F[ 8] );
|
||||
ADD_SUB( F[ 1], F[ 9] );
|
||||
ADD_SUB( F[ 2], F[10] );
|
||||
ADD_SUB( F[ 3], F[11] );
|
||||
ADD_SUB( F[ 4], F[12] );
|
||||
ADD_SUB( F[ 5], F[13] );
|
||||
ADD_SUB( F[ 6], F[14] );
|
||||
ADD_SUB( F[ 7], F[15] );
|
||||
|
||||
#undef ADD_SUB
|
||||
|
||||
const __m128i mask = m128_const1_32( 0x000000ff );
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
_mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) )
|
||||
|
||||
out[ 0] = Q_REDUCE( F[ 0] );
|
||||
out[ 1] = Q_REDUCE( F[ 1] );
|
||||
out[ 2] = Q_REDUCE( F[ 2] );
|
||||
out[ 3] = Q_REDUCE( F[ 3] );
|
||||
out[ 4] = Q_REDUCE( F[ 4] );
|
||||
out[ 5] = Q_REDUCE( F[ 5] );
|
||||
out[ 6] = Q_REDUCE( F[ 6] );
|
||||
out[ 7] = Q_REDUCE( F[ 7] );
|
||||
out[ 8] = Q_REDUCE( F[ 8] );
|
||||
out[ 9] = Q_REDUCE( F[ 9] );
|
||||
out[10] = Q_REDUCE( F[10] );
|
||||
out[11] = Q_REDUCE( F[11] );
|
||||
out[12] = Q_REDUCE( F[12] );
|
||||
out[13] = Q_REDUCE( F[13] );
|
||||
out[14] = Q_REDUCE( F[14] );
|
||||
out[15] = Q_REDUCE( F[15] );
|
||||
|
||||
#undef Q_REDUCE
|
||||
|
||||
#else // < SSE4.1
|
||||
|
||||
swift_int16_t *mult = multipliers;
|
||||
|
||||
// First loop unrolling:
|
||||
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
||||
|
||||
/*
|
||||
swift_int32_t F[64];
|
||||
@@ -666,11 +853,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
|
||||
F60, F61, F62, F63;
|
||||
|
||||
// First loop unrolling:
|
||||
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
||||
|
||||
F0 = mult[0] * table[0];
|
||||
F8 = mult[1] * table[1];
|
||||
F0 = mult[0] * table[0];
|
||||
F8 = mult[1] * table[1];
|
||||
F16 = mult[2] * table[2];
|
||||
F24 = mult[3] * table[3];
|
||||
F32 = mult[4] * table[4];
|
||||
@@ -678,90 +862,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
F48 = mult[6] * table[6];
|
||||
F56 = mult[7] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[1] << 3]);
|
||||
|
||||
F1 = mult[0] * table[0];
|
||||
F9 = mult[1] * table[1];
|
||||
F17 = mult[2] * table[2];
|
||||
F25 = mult[3] * table[3];
|
||||
F33 = mult[4] * table[4];
|
||||
F41 = mult[5] * table[5];
|
||||
F49 = mult[6] * table[6];
|
||||
F57 = mult[7] * table[7];
|
||||
F1 = mult[ 8] * table[0];
|
||||
F9 = mult[ 9] * table[1];
|
||||
F17 = mult[10] * table[2];
|
||||
F25 = mult[11] * table[3];
|
||||
F33 = mult[12] * table[4];
|
||||
F41 = mult[13] * table[5];
|
||||
F49 = mult[14] * table[6];
|
||||
F57 = mult[15] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[2] << 3]);
|
||||
|
||||
F2 = mult[0] * table[0];
|
||||
F10 = mult[1] * table[1];
|
||||
F18 = mult[2] * table[2];
|
||||
F26 = mult[3] * table[3];
|
||||
F34 = mult[4] * table[4];
|
||||
F42 = mult[5] * table[5];
|
||||
F50 = mult[6] * table[6];
|
||||
F58 = mult[7] * table[7];
|
||||
F2 = mult[16] * table[0];
|
||||
F10 = mult[17] * table[1];
|
||||
F18 = mult[18] * table[2];
|
||||
F26 = mult[19] * table[3];
|
||||
F34 = mult[20] * table[4];
|
||||
F42 = mult[21] * table[5];
|
||||
F50 = mult[22] * table[6];
|
||||
F58 = mult[23] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[3] << 3]);
|
||||
|
||||
F3 = mult[0] * table[0];
|
||||
F11 = mult[1] * table[1];
|
||||
F19 = mult[2] * table[2];
|
||||
F27 = mult[3] * table[3];
|
||||
F35 = mult[4] * table[4];
|
||||
F43 = mult[5] * table[5];
|
||||
F51 = mult[6] * table[6];
|
||||
F59 = mult[7] * table[7];
|
||||
F3 = mult[24] * table[0];
|
||||
F11 = mult[25] * table[1];
|
||||
F19 = mult[26] * table[2];
|
||||
F27 = mult[27] * table[3];
|
||||
F35 = mult[28] * table[4];
|
||||
F43 = mult[29] * table[5];
|
||||
F51 = mult[30] * table[6];
|
||||
F59 = mult[31] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[4] << 3]);
|
||||
|
||||
F4 = mult[0] * table[0];
|
||||
F12 = mult[1] * table[1];
|
||||
F20 = mult[2] * table[2];
|
||||
F28 = mult[3] * table[3];
|
||||
F36 = mult[4] * table[4];
|
||||
F44 = mult[5] * table[5];
|
||||
F52 = mult[6] * table[6];
|
||||
F60 = mult[7] * table[7];
|
||||
F4 = mult[32] * table[0];
|
||||
F12 = mult[33] * table[1];
|
||||
F20 = mult[34] * table[2];
|
||||
F28 = mult[35] * table[3];
|
||||
F36 = mult[36] * table[4];
|
||||
F44 = mult[37] * table[5];
|
||||
F52 = mult[38] * table[6];
|
||||
F60 = mult[39] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[5] << 3]);
|
||||
|
||||
F5 = mult[0] * table[0];
|
||||
F13 = mult[1] * table[1];
|
||||
F21 = mult[2] * table[2];
|
||||
F29 = mult[3] * table[3];
|
||||
F37 = mult[4] * table[4];
|
||||
F45 = mult[5] * table[5];
|
||||
F53 = mult[6] * table[6];
|
||||
F61 = mult[7] * table[7];
|
||||
F5 = mult[40] * table[0];
|
||||
F13 = mult[41] * table[1];
|
||||
F21 = mult[42] * table[2];
|
||||
F29 = mult[43] * table[3];
|
||||
F37 = mult[44] * table[4];
|
||||
F45 = mult[45] * table[5];
|
||||
F53 = mult[46] * table[6];
|
||||
F61 = mult[47] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[6] << 3]);
|
||||
|
||||
F6 = mult[0] * table[0];
|
||||
F14 = mult[1] * table[1];
|
||||
F22 = mult[2] * table[2];
|
||||
F30 = mult[3] * table[3];
|
||||
F38 = mult[4] * table[4];
|
||||
F46 = mult[5] * table[5];
|
||||
F54 = mult[6] * table[6];
|
||||
F62 = mult[7] * table[7];
|
||||
F6 = mult[48] * table[0];
|
||||
F14 = mult[49] * table[1];
|
||||
F22 = mult[50] * table[2];
|
||||
F30 = mult[51] * table[3];
|
||||
F38 = mult[52] * table[4];
|
||||
F46 = mult[53] * table[5];
|
||||
F54 = mult[54] * table[6];
|
||||
F62 = mult[55] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[7] << 3]);
|
||||
|
||||
F7 = mult[0] * table[0];
|
||||
F15 = mult[1] * table[1];
|
||||
F23 = mult[2] * table[2];
|
||||
F31 = mult[3] * table[3];
|
||||
F39 = mult[4] * table[4];
|
||||
F47 = mult[5] * table[5];
|
||||
F55 = mult[6] * table[6];
|
||||
F63 = mult[7] * table[7];
|
||||
F7 = mult[56] * table[0];
|
||||
F15 = mult[57] * table[1];
|
||||
F23 = mult[58] * table[2];
|
||||
F31 = mult[59] * table[3];
|
||||
F39 = mult[60] * table[4];
|
||||
F47 = mult[61] * table[5];
|
||||
F55 = mult[62] * table[6];
|
||||
F63 = mult[63] * table[7];
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
int temp = b; \
|
||||
b = a - b; \
|
||||
a = a + temp; \
|
||||
}
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
( ( (a) & 0xff ) - ( (a) >> 8 ) )
|
||||
|
||||
/*
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
@@ -800,7 +987,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// Second loop unrolling:
|
||||
// Iteration 0:
|
||||
ADD_SUB(F0, F1);
|
||||
@@ -1057,6 +1243,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
output[47] = Q_REDUCE(F61);
|
||||
output[55] = Q_REDUCE(F62);
|
||||
output[63] = Q_REDUCE(F63);
|
||||
|
||||
#undef ADD_SUB
|
||||
#undef Q_REDUCE
|
||||
|
||||
#endif // AVX2 elif SSE4.1 else
|
||||
}
|
||||
|
||||
// Calculates the FFT part of SWIFFT.
|
||||
@@ -1086,24 +1277,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
|
||||
// - m: the input size divided by 64.
|
||||
// - output: will store the result.
|
||||
// - a: the coefficients in the sum. Of size 64 * m.
|
||||
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
|
||||
void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
|
||||
const swift_int16_t *a )
|
||||
{
|
||||
int i, j;
|
||||
swift_int32_t result[N];
|
||||
swift_int32_t result[N] __attribute__ ((aligned (64)));
|
||||
register swift_int16_t carry = 0;
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
__m512i *res = (__m512i*)result;
|
||||
for ( j = 0; j < N/16; ++j )
|
||||
{
|
||||
__m512i sum = _mm512_setzero_si512();
|
||||
const __m512i *f = (__m512i*)input + j;
|
||||
const __m512i *k = (__m512i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/16, k += N/16 )
|
||||
sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
__m256i *res = (__m256i*)result;
|
||||
for ( j = 0; j < N/8; ++j )
|
||||
{
|
||||
__m256i sum = _mm256_setzero_si256();
|
||||
const __m256i *f = (__m256i*)input + j;
|
||||
const __m256i *k = (__m256i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/8, k += N/8 )
|
||||
sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
__m128i *res = (__m128i*)result;
|
||||
for ( j = 0; j < N/4; ++j )
|
||||
{
|
||||
__m128i sum = _mm_setzero_si128();
|
||||
const __m128i *f = (__m128i*)input + j;
|
||||
const __m128i *k = (__m128i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
|
||||
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
{
|
||||
register swift_int32_t sum = 0;
|
||||
const register swift_int32_t *f = input + j;
|
||||
const register swift_int16_t *k = a + j;
|
||||
|
||||
for (i = 0; i < m; i++, f += N,k += N)
|
||||
sum += (*f) * (*k);
|
||||
|
||||
result[j] = sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
|
||||
|
||||
@@ -1116,14 +1349,15 @@ void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const s
|
||||
output[N] = carry;
|
||||
}
|
||||
|
||||
/*
|
||||
void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE],
|
||||
bool doSmooth)
|
||||
{
|
||||
int i;
|
||||
// Will store the result of the FFT parts:
|
||||
swift_int32_t fftOut[N * M];
|
||||
unsigned char intermediate[N * 3 + 8];
|
||||
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
|
||||
unsigned char carry0,carry1,carry2;
|
||||
|
||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||
@@ -1193,51 +1427,50 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
output[N] = 0;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] )
|
||||
void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output )
|
||||
{
|
||||
int i;
|
||||
// Will store the result of the FFT parts:
|
||||
swift_int32_t fftOut[N * M];
|
||||
unsigned char intermediate[N * 3 + 8];
|
||||
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||
unsigned char sum[ N*3 + 8 ] __attribute__ ((aligned (64)));
|
||||
unsigned char carry0,carry1,carry2;
|
||||
|
||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||
// overriden by the following SWIFFT):
|
||||
|
||||
// 1. Compute the FFT of the input - the common part for the first 3 SWIFFTs:
|
||||
SWIFFTFFT(input, M, fftOut);
|
||||
SWIFFTFFT( input, M, fftOut );
|
||||
|
||||
// 2. Compute the sums of the 3 SWIFFTs, each using a different set of coefficients:
|
||||
|
||||
// 2a. The first SWIFFT:
|
||||
SWIFFTSum(fftOut, M, intermediate, As);
|
||||
// Remember the carry byte:
|
||||
carry0 = intermediate[N];
|
||||
SWIFFTSum( fftOut, M, sum, As );
|
||||
carry0 = sum[N];
|
||||
|
||||
// 2b. The second one:
|
||||
SWIFFTSum(fftOut, M, intermediate + N, As + (M * N));
|
||||
carry1 = intermediate[2 * N];
|
||||
SWIFFTSum( fftOut, M, sum + N, As + M*N );
|
||||
carry1 = sum[ 2*N ];
|
||||
|
||||
// 2c. The third one:
|
||||
SWIFFTSum(fftOut, M, intermediate + (2 * N), As + 2 * (M * N));
|
||||
carry2 = intermediate[3 * N];
|
||||
SWIFFTSum( fftOut, M, sum + 2*N, As + 2*M*N );
|
||||
carry2 = sum[ 3*N ];
|
||||
|
||||
//2d. Put three carry bytes in their place
|
||||
intermediate[3 * N] = carry0;
|
||||
intermediate[(3 * N) + 1] = carry1;
|
||||
intermediate[(3 * N) + 2] = carry2;
|
||||
sum[ 3*N ] = carry0;
|
||||
sum[ 3*N + 1 ] = carry1;
|
||||
sum[ 3*N + 2 ] = carry2;
|
||||
|
||||
// Padding intermediate output with 5 zeroes.
|
||||
memset(intermediate + (3 * N) + 3, 0, 5);
|
||||
memset( sum + 3*N + 3, 0, 5 );
|
||||
|
||||
// Apply the S-Box:
|
||||
for ( i = 0; i < (3 * N) + 8; ++i )
|
||||
intermediate[i] = SBox[intermediate[i]];
|
||||
sum[i] = SBox[ sum[i] ];
|
||||
|
||||
// 3. The final and last SWIFFT:
|
||||
SWIFFTFFT(intermediate, 3 * (N/8) + 1, fftOut);
|
||||
SWIFFTSum(fftOut, 3 * (N/8) + 1, output, As);
|
||||
|
||||
SWIFFTFFT( sum, 3 * (N/8) + 1, fftOut );
|
||||
SWIFFTSum( fftOut, 3 * (N/8) + 1, sum, As );
|
||||
memcpy( output, sum, SWIFFTX_OUTPUT_BLOCK_SIZE - 1 );
|
||||
}
|
||||
|
@@ -61,11 +61,10 @@ void ComputeSingleSWIFFT(unsigned char *input, unsigned short m,
|
||||
//
|
||||
// Returns:
|
||||
// - Success value.
|
||||
void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE] );
|
||||
void ComputeSingleSWIFFTX( unsigned char *input, unsigned char *output );
|
||||
|
||||
void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth);
|
||||
//void ComputeSingleSWIFFTX_smooth( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
// unsigned char output[SWIFFTX_OUTPUT_BLOCK_SIZE], bool doSmooth);
|
||||
|
||||
// Calculates the powers of OMEGA and generates the bit reversal permutation.
|
||||
// You must call this function before doing SWIFFT/X, otherwise you will get zeroes everywhere.
|
||||
|
@@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p )
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline uint32_t rotl32( uint32_t a, size_t r )
|
||||
{
|
||||
return ( a << r ) | ( a >> (32-r) );
|
||||
}
|
||||
|
||||
// Vectorized and targetted version of fnv1a
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
|
||||
{ \
|
||||
const uint32_t *blob_off = blob + \
|
||||
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
|
||||
( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \
|
||||
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
|
||||
UPDATE_ACCUMULATOR; \
|
||||
MULXOR; \
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "Verthash.h"
|
||||
#include "tiny_sha3/sha3-4way.h"
|
||||
|
||||
@@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate )
|
||||
uint8_t vhDataFileHash[32] = { 0 };
|
||||
|
||||
applog( LOG_NOTICE, "Verifying Verthash data" );
|
||||
sph_sha256_full( vhDataFileHash, verthashInfo.data,
|
||||
sha256_full( vhDataFileHash, verthashInfo.data,
|
||||
verthashInfo.dataSize );
|
||||
if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes,
|
||||
sizeof(verthashDatFileHash_bytes) ) == 0 )
|
||||
|
@@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n );
|
||||
whirlpool_hash(vhash, endiandata);
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! opt_benchmark )
|
||||
submit_solution( work, vhash, mythr );
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
|
@@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata )
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
|
||||
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
|
||||
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
@@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)in0 + 64, 16 );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
|
||||
(const byte*)in1 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
|
||||
(const byte*)in2 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
|
||||
(const byte*)in3 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4,
|
||||
(const byte*)in4 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5,
|
||||
(const byte*)in5 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6,
|
||||
(const byte*)in6 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7,
|
||||
(const byte*)in7 + 64, 16 );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<2), 16 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
|
||||
(const byte*)in0, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash4,
|
||||
(const byte*)in4, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash5,
|
||||
(const byte*)in5, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash6,
|
||||
(const byte*)in6, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash7,
|
||||
(const byte*)in7, size );
|
||||
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||
cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
@@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_2x128( vdata2, edata, edata, 640 );
|
||||
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
|
||||
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
@@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
intrlv_2x128( vhash, hash0, hash1, 640 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, hash2, hash3, 640 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
intrlv_2x128( vhash, hash0, hash1, 640 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, hash2, hash3, 640 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)in0 + 64, 16 );
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
cube_2way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3 + 64, 16 );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
cube_2way_update_close( &ctx.cube, vhash,
|
||||
vhash + (16<<1), 16 );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
|
||||
(const byte*)in0, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
|
||||
(const byte*)in1, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
|
||||
(const byte*)in2, size );
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
|
||||
(const byte*)in3, size );
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
cube_2way_full( &ctx.cube, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
|
@@ -1,4 +1,5 @@
|
||||
#include "x16r-gate.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
|
||||
|
||||
@@ -61,8 +62,7 @@ bool register_x16r_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -80,8 +80,7 @@ bool register_x16rv2_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rv2;
|
||||
gate->hash = (void*)&x16rv2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -99,8 +98,7 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -233,8 +231,7 @@ bool register_x16rt_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -251,8 +248,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->build_extraheader = (void*)&veil_build_extraheader;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -291,8 +287,7 @@ bool register_x21s_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&x21s_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_thread_init;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
|
@@ -37,6 +37,7 @@
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
@@ -115,7 +116,7 @@ union _x16r_8way_context_overlay
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
cube_4way_context cube;
|
||||
simd_4way_context simd;
|
||||
hamsi512_8way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
@@ -164,8 +165,8 @@ union _x16r_4way_context_overlay
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
hashState_luffa luffa1;
|
||||
cubehashParam cube;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
|
@@ -13,7 +13,7 @@
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
#if defined (X21S_8WAY)
|
||||
@@ -208,9 +208,7 @@ union _x21s_4way_context_overlay
|
||||
haval256_5_4way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(__SHA__)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(__SHA__)
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
@@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sha256_full( output, hash0, 64 );
|
||||
sha256_full( output+32, hash1, 64 );
|
||||
sha256_full( output+64, hash2, 64 );
|
||||
sha256_full( output+96, hash3, 64 );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
@@ -23,7 +23,7 @@ union _x21s_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
};
|
||||
typedef union _x21s_context_overlay x21s_context_overlay;
|
||||
|
||||
@@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid )
|
||||
sph_gost512 ( &ctx.gost, (const void*) hash, 64 );
|
||||
sph_gost512_close( &ctx.gost, (void*) hash );
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy( output, hash, 32 );
|
||||
|
||||
|
@@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
|
||||
init_sonoa_ctx();
|
||||
gate->hash = (void*)&sonoa_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -37,7 +37,8 @@ union _x17_8way_context_overlay
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
// cube_4way_context cube;
|
||||
cube_4way_2buf_context cube;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
@@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
|
||||
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
||||
|
||||
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
|
@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
|
||||
#else
|
||||
gate->hash = (void*)&x17_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate )
|
||||
init_xevan_ctx();
|
||||
gate->hash = (void*)&xevan_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -28,7 +28,7 @@
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
#if defined(X22I_8WAY)
|
||||
@@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay
|
||||
haval256_5_8way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X22I_8WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(X22I_8WAY_SHA)
|
||||
sha256_8way_context sha256;
|
||||
#endif
|
||||
#if defined(__VAES__)
|
||||
@@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X22I_8WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash4, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+128 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash5, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+160 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash6, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+192 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash7, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+224 );
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
sha256_full( hash4, hash4, 64 );
|
||||
sha256_full( hash5, hash5, 64 );
|
||||
sha256_full( hash6, hash6, 64 );
|
||||
sha256_full( hash7, hash7, 64 );
|
||||
|
||||
#else
|
||||
|
||||
@@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay
|
||||
haval256_5_4way_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X22I_4WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
#else
|
||||
#if !defined(X22I_4WAY_SHA)
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
};
|
||||
@@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X22I_4WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+32 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+64 );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3, 64 );
|
||||
sph_sha256_close( &ctx.sha256, output+96 );
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
sha256_full( hash2, hash2, 64 );
|
||||
sha256_full( hash3, hash3, 64 );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -31,8 +31,8 @@ bool register_x22i_algo( algo_gate_t* gate )
|
||||
|
||||
#endif
|
||||
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -48,8 +48,8 @@ bool register_x25x_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x25x;
|
||||
gate->hash = (void*)&x25x_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
|
||||
AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
|
||||
AVX512_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -24,6 +24,7 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
@@ -57,7 +58,6 @@ union _x22i_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
};
|
||||
typedef union _x22i_context_overlay x22i_context_overlay;
|
||||
|
||||
@@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
sph_gost512 (&ctx.gost, (const void*) hash, 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) hash);
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash, 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash );
|
||||
sha256_full( hash, hash, 64 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
|
||||
|
@@ -33,7 +33,7 @@
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
#if defined(__SHA__)
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#endif
|
||||
|
||||
void x25x_shuffle( void *hash )
|
||||
@@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X25X_8WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
#else
|
||||
sha256_8way_context sha256;
|
||||
#endif
|
||||
@@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X25X_8WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash0[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash1[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash2[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash3[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash4[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash4[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash5[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash5[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash6[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash6[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash7[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash7[21] );
|
||||
|
||||
sha256_full( hash0[21], hash0[20], 64 );
|
||||
sha256_full( hash1[21], hash1[20], 64 );
|
||||
sha256_full( hash2[21], hash2[20], 64 );
|
||||
sha256_full( hash3[21], hash3[20], 64 );
|
||||
sha256_full( hash4[21], hash4[20], 64 );
|
||||
sha256_full( hash5[21], hash5[20], 64 );
|
||||
sha256_full( hash6[21], hash6[20], 64 );
|
||||
sha256_full( hash7[21], hash7[20], 64 );
|
||||
|
||||
intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21],
|
||||
hash4[21], hash5[21], hash6[21], hash7[21] );
|
||||
|
||||
@@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
#if defined(X25X_4WAY_SHA)
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
#else
|
||||
sha256_4way_context sha256;
|
||||
#endif
|
||||
@@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
#if defined(X25X_4WAY_SHA)
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash0[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash0[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash1[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash1[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash2[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash2[21] );
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, hash3[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, hash3[21] );
|
||||
sha256_full( hash0[21], hash0[20], 64 );
|
||||
sha256_full( hash1[21], hash1[20], 64 );
|
||||
sha256_full( hash2[21], hash2[20], 64 );
|
||||
sha256_full( hash3[21], hash3[20], 64 );
|
||||
|
||||
intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] );
|
||||
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
@@ -60,7 +60,7 @@ union _x25x_context_overlay
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
sph_sha256_context sha256;
|
||||
sha256_context sha256;
|
||||
sph_panama_context panama;
|
||||
blake2s_state blake2s;
|
||||
};
|
||||
@@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
sph_gost512 (&ctx.gost, (const void*) &hash[19], 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) &hash[20]);
|
||||
|
||||
sph_sha256_init( &ctx.sha256 );
|
||||
sph_sha256( &ctx.sha256, &hash[20], 64 );
|
||||
sph_sha256_close( &ctx.sha256, &hash[21] );
|
||||
sha256_full( &hash[21], &hash[20], 64 );
|
||||
|
||||
sph_panama_init(&ctx.panama);
|
||||
sph_panama (&ctx.panama, (const void*) &hash[21], 64 );
|
||||
|
@@ -35,9 +35,11 @@
|
||||
#include "blake2b-yp.h"
|
||||
|
||||
// Cyclic right rotation.
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
#endif
|
||||
//#ifndef ROTR64
|
||||
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
//#endif
|
||||
|
||||
#define ROTR64(x, y) ror64( x, y )
|
||||
|
||||
// Little-endian byte access.
|
||||
#define B2B_GET64(p) \
|
||||
|
@@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
sha256_ctx_init( &sha256_prehash_ctx );
|
||||
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
yespower_tls( (unsigned char *)endiandata, params.perslen,
|
||||
|
@@ -27,14 +27,11 @@
|
||||
* coin.
|
||||
*/
|
||||
#include "yespower.h"
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
yespower_params_t yespower_params;
|
||||
|
||||
//SHA256_CTX sha256_prehash_ctx;
|
||||
__thread sph_sha256_context sha256_prehash_ctx;
|
||||
//__thread SHA256_CTX sha256_prehash_ctx;
|
||||
__thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
// YESPOWER
|
||||
|
||||
@@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
sha256_ctx_init( &sha256_prehash_ctx );
|
||||
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
|
||||
@@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce,
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sph_sha256_init( &sha256_prehash_ctx );
|
||||
sph_sha256( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) )
|
||||
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
|
||||
|
@@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
||||
ARX(X0, X3, X2, 18) \
|
||||
/* Rearrange data */ \
|
||||
X1 = _mm_shuffle_epi32(X1, 0x93); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x39); \
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x39); \
|
||||
/* Operate on "rows" */ \
|
||||
ARX(X3, X0, X1, 7) \
|
||||
ARX(X2, X3, X0, 9) \
|
||||
ARX(X1, X2, X3, 13) \
|
||||
ARX(X0, X1, X2, 18) \
|
||||
/* Rearrange data */ \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x93); \
|
||||
X1 = _mm_shuffle_epi32(X1, 0x39); \
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E); \
|
||||
X3 = _mm_shuffle_epi32(X3, 0x93);
|
||||
X2 = _mm_shuffle_epi32(X2, 0x4E);
|
||||
|
||||
/**
|
||||
* Apply the Salsa20 core to the block provided in (X0 ... X3).
|
||||
@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
|
||||
#define INTEGERIFY (uint32_t)X.d[0]
|
||||
#endif
|
||||
|
||||
// AVX512 ternary logic optimization
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define XOR_X_XOR_X( in1, in2 ) \
|
||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
|
||||
|
||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
|
||||
|
||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
|
||||
SALSA20(out)
|
||||
|
||||
#else
|
||||
|
||||
#define XOR_X_XOR_X( in1, in2 ) \
|
||||
XOR_X( in1 ) \
|
||||
XOR_X( in2 )
|
||||
|
||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||
XOR_X_2( in1, in2 ) \
|
||||
XOR_X( in3 )
|
||||
|
||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||
XOR_X(in1) \
|
||||
XOR_X(in2) \
|
||||
SALSA20( out )
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Apply the Salsa20 core to the block provided in X ^ in.
|
||||
*/
|
||||
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
|
||||
{
|
||||
DECL_X
|
||||
|
||||
XOR_X_2(Bin1[1], Bin2[1])
|
||||
XOR_X(Bin1[0])
|
||||
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
|
||||
// XOR_X_2(Bin1[1], Bin2[1])
|
||||
// XOR_X(Bin1[0])
|
||||
SALSA20_XOR_MEM(Bin2[0], Bout[0])
|
||||
XOR_X(Bin1[1])
|
||||
SALSA20_XOR_MEM(Bin2[1], Bout[1])
|
||||
|
||||
// Factor out the XOR from salsa20 to do a xor3
|
||||
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
|
||||
// XOR_X(Bin1[1])
|
||||
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
|
||||
|
||||
return INTEGERIFY;
|
||||
}
|
||||
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
||||
i = 0;
|
||||
r--;
|
||||
do {
|
||||
XOR_X(Bin1[i])
|
||||
XOR_X(Bin2[i])
|
||||
XOR_X_XOR_X( Bin1[i], Bin2[i] )
|
||||
// XOR_X(Bin1[i])
|
||||
// XOR_X(Bin2[i])
|
||||
PWXFORM
|
||||
WRITE_X(Bout[i])
|
||||
|
||||
XOR_X(Bin1[i + 1])
|
||||
XOR_X(Bin2[i + 1])
|
||||
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
|
||||
// XOR_X(Bin1[i + 1])
|
||||
// XOR_X(Bin2[i + 1])
|
||||
PWXFORM
|
||||
|
||||
if (unlikely(i >= r))
|
||||
@@ -1050,7 +1095,7 @@ int yespower(yespower_local_t *local,
|
||||
salsa20_blk_t *V, *XY;
|
||||
pwxform_ctx_t ctx;
|
||||
uint8_t sha256[32];
|
||||
sph_sha256_context sha256_ctx;
|
||||
sha256_context sha256_ctx;
|
||||
|
||||
/* Sanity-check parameters */
|
||||
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0)
|
||||
@@ -1093,10 +1138,9 @@ int yespower(yespower_local_t *local,
|
||||
|
||||
// copy prehash, do tail
|
||||
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
|
||||
|
||||
sph_sha256( &sha256_ctx, src+64, srclen-64 );
|
||||
sph_sha256_close( &sha256_ctx, sha256 );
|
||||
|
||||
sha256_update( &sha256_ctx, src+64, srclen-64 );
|
||||
sha256_final( &sha256_ctx, sha256 );
|
||||
|
||||
if ( version == YESPOWER_0_5 )
|
||||
{
|
||||
PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size );
|
||||
@@ -1141,7 +1185,9 @@ int yespower(yespower_local_t *local,
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
smix_1_0( B, r, N, V, XY, &ctx );
|
||||
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256),
|
||||
(uint8_t *)dst );
|
||||
}
|
||||
|
@@ -34,8 +34,7 @@
|
||||
#include <stdlib.h> /* for size_t */
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -79,9 +78,7 @@ typedef struct {
|
||||
|
||||
extern yespower_params_t yespower_params;
|
||||
|
||||
//SHA256_CTX sha256_prehash_ctx;
|
||||
extern __thread sph_sha256_context sha256_prehash_ctx;
|
||||
//extern __thread SHA256_CTX sha256_prehash_ctx;
|
||||
extern __thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
/**
|
||||
* yespower_init_local(local):
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user