mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00

Compare commits

4 Commits

502ed0b1fe
d60a268972
e4265a6f11
a28daca3ce
Makefile.am (25)
@@ -70,6 +70,8 @@ cpuminer_SOURCES = \
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/myrgr-gate.c \
  algo/groestl/myrgr-4way.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
@@ -97,10 +99,10 @@ cpuminer_SOURCES = \
  algo/keccak/keccak-4way.c \
  algo/keccak/keccak-gate.c \
  algo/keccak/sse2/keccak.c \
  algo/lbry.c \
  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
  algo/luffa/sse2/luffa_for_sse2.c \
  algo/luffa/luffa_for_sse2.c \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/lyra2rev2-gate.c \
@@ -114,6 +116,9 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-gate.c \
  algo/lyra2/lyra2h.c \
  algo/lyra2/lyra2h-4way.c \
  algo/lyra2/allium-gate.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -127,9 +132,17 @@ cpuminer_SOURCES = \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
  algo/qubit/qubit-2way.c \
  algo/qubit/deep-gate.c \
  algo/qubit/deep-2way.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
  algo/ripemd/ripemd-hash-4way.c \
  algo/ripemd/lbry-gate.c \
  algo/ripemd/lbry.c \
  algo/ripemd/lbry-4way.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
@@ -143,8 +156,9 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
  algo/simd/sse2/vector.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
@@ -184,6 +198,9 @@ cpuminer_SOURCES = \
  algo/x11/x11evo.c \
  algo/x11/x11evo-4way.c \
  algo/x11/x11evo-gate.c \
  algo/x12/x12-gate.c \
  algo/x12/x12.c \
  algo/x12/x12-4way.c \
  algo/x13/x13-gate.c \
  algo/x13/x13.c \
  algo/x13/x13-4way.c \
README.md (61)
@@ -13,9 +13,34 @@ mailto://jayddee246@gmail.com

See file RELEASE_NOTES for change log and compile instructions.

Requirements
------------

1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.

Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.

ARM CPUs are not supported.

2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

MacOS and OSX are not supported.

3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.

Supported Algorithms
--------------------

allium        Garlicoin
anime         Animecoin
argon2
axiom         Shabal-256 MemoHash
bastion
@@ -74,40 +99,19 @@ Supported Algorithms
x11           Dash
x11evo        Revolvercoin
x11gost       sib (SibCoin)
x12           Galaxie Cash (GCH)
x13           X13
x13sm3        hsr (Hshare)
x14           X14
x15           X15
x16r          Ravencoin
x17
xevan         Bitsend
yescrypt      Globalboost-Y (BSTY)
yescryptr8    BitZeny (ZNY)\n\
yescryptr8    BitZeny (ZNY)
yescryptr16   Yenten (YTN)
zr5           Ziftr

Requirements
------------

1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.

Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.

ARM CPUs are not supported.

2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

MacOS and OSX are not supported.

3. Stratum pool. Some algos may work wallet mining using getwork.

Errata
------

@@ -136,10 +140,13 @@ output from the miner showing the startup and any errors.

Donations
---------

I do not do this for money but I have a donation address if users
are so inclined.
cpuminer-opt has no fees of any kind but donations are accepted.

bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

Happy mining!
@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen

If you like this software feel free to donate:

BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
@@ -98,8 +98,8 @@ Start mining.

Windows

The following is how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.

Building on Windows prerequisites:

@@ -131,7 +131,7 @@ or similar Windows program.

In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt

Run winbuild.sh to build on Windows or execute the following commands.
Run build.sh to build on Windows or execute the following commands.

./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -159,6 +159,34 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.

Change Log
----------

v3.8.2.1

Fixed low difficulty rejects with allium.
Fixed qubit AVX2.
Restored lyra2z lost hash.
Fixed build.sh

v3.8.2

Fixed and faster myr-gr.
Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
Faster lyra2rev2, lbry, skein.
Large reduction in compiler warnings.

v3.8.1.1

Fixed Windows AVX2 crash.

v3.8.1

Fixes x16r on CPUs with only SSE2.
More optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt, no new optimizations.

v3.8.0.1

Fixed x16r AVX2 low hash rate.

v3.8.0

4way no longer a separate feature, included in AVX2.
@@ -155,6 +155,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
     case ALGO_ALLIUM:      register_allium_algo     ( gate ); break;
     case ALGO_ANIME:       register_anime_algo      ( gate ); break;
     case ALGO_ARGON2:      register_argon2_algo     ( gate ); break;
     case ALGO_AXIOM:       register_axiom_algo      ( gate ); break;
@@ -213,6 +214,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X11:         register_x11_algo        ( gate ); break;
     case ALGO_X11EVO:      register_x11evo_algo     ( gate ); break;
     case ALGO_X11GOST:     register_x11gost_algo    ( gate ); break;
     case ALGO_X12:         register_x12_algo        ( gate ); break;
     case ALGO_X13:         register_x13_algo        ( gate ); break;
     case ALGO_X13SM3:      register_x13sm3_algo     ( gate ); break;
     case ALGO_X14:         register_x14_algo        ( gate ); break;
@@ -298,6 +300,7 @@ const char* const algo_alias_map[][2] =
  { "lyra2",     "lyra2re"   },
  { "lyra2v2",   "lyra2rev2" },
  { "lyra2zoin", "lyra2z330" },
  { "myrgr",     "myr-gr"    },
  { "myriad",    "myr-gr"    },
  { "neo",       "neoscrypt" },
  { "phi",       "phi1612"   },
@@ -553,22 +553,22 @@ do { \
                     , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
   VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
                       _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
   M[0x0] = mm_byteswap_32( *(buf + 0) ); \
   M[0x1] = mm_byteswap_32( *(buf + 1) ); \
   M[0x2] = mm_byteswap_32( *(buf + 2) ); \
   M[0x3] = mm_byteswap_32( *(buf + 3) ); \
   M[0x4] = mm_byteswap_32( *(buf + 4) ); \
   M[0x5] = mm_byteswap_32( *(buf + 5) ); \
   M[0x6] = mm_byteswap_32( *(buf + 6) ); \
   M[0x7] = mm_byteswap_32( *(buf + 7) ); \
   M[0x8] = mm_byteswap_32( *(buf + 8) ); \
   M[0x9] = mm_byteswap_32( *(buf + 9) ); \
   M[0xA] = mm_byteswap_32( *(buf + 10) ); \
   M[0xB] = mm_byteswap_32( *(buf + 11) ); \
   M[0xC] = mm_byteswap_32( *(buf + 12) ); \
   M[0xD] = mm_byteswap_32( *(buf + 13) ); \
   M[0xE] = mm_byteswap_32( *(buf + 14) ); \
   M[0xF] = mm_byteswap_32( *(buf + 15) ); \
   M[0x0] = mm_bswap_32( *(buf + 0) ); \
   M[0x1] = mm_bswap_32( *(buf + 1) ); \
   M[0x2] = mm_bswap_32( *(buf + 2) ); \
   M[0x3] = mm_bswap_32( *(buf + 3) ); \
   M[0x4] = mm_bswap_32( *(buf + 4) ); \
   M[0x5] = mm_bswap_32( *(buf + 5) ); \
   M[0x6] = mm_bswap_32( *(buf + 6) ); \
   M[0x7] = mm_bswap_32( *(buf + 7) ); \
   M[0x8] = mm_bswap_32( *(buf + 8) ); \
   M[0x9] = mm_bswap_32( *(buf + 9) ); \
   M[0xA] = mm_bswap_32( *(buf + 10) ); \
   M[0xB] = mm_bswap_32( *(buf + 11) ); \
   M[0xC] = mm_bswap_32( *(buf + 12) ); \
   M[0xD] = mm_bswap_32( *(buf + 13) ); \
   M[0xE] = mm_bswap_32( *(buf + 14) ); \
   M[0xF] = mm_bswap_32( *(buf + 15) ); \
   for (r = 0; r < rounds; r ++) \
      ROUND_S_4WAY(r); \
   H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +615,22 @@ do { \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
   M0 = mm_byteswap_32( * buf ); \
   M1 = mm_byteswap_32( *(buf+1) ); \
   M2 = mm_byteswap_32( *(buf+2) ); \
   M3 = mm_byteswap_32( *(buf+3) ); \
   M4 = mm_byteswap_32( *(buf+4) ); \
   M5 = mm_byteswap_32( *(buf+5) ); \
   M6 = mm_byteswap_32( *(buf+6) ); \
   M7 = mm_byteswap_32( *(buf+7) ); \
   M8 = mm_byteswap_32( *(buf+8) ); \
   M9 = mm_byteswap_32( *(buf+9) ); \
   MA = mm_byteswap_32( *(buf+10) ); \
   MB = mm_byteswap_32( *(buf+11) ); \
   MC = mm_byteswap_32( *(buf+12) ); \
   MD = mm_byteswap_32( *(buf+13) ); \
   ME = mm_byteswap_32( *(buf+14) ); \
   MF = mm_byteswap_32( *(buf+15) ); \
   M0 = mm_bswap_32( * buf ); \
   M1 = mm_bswap_32( *(buf+1) ); \
   M2 = mm_bswap_32( *(buf+2) ); \
   M3 = mm_bswap_32( *(buf+3) ); \
   M4 = mm_bswap_32( *(buf+4) ); \
   M5 = mm_bswap_32( *(buf+5) ); \
   M6 = mm_bswap_32( *(buf+6) ); \
   M7 = mm_bswap_32( *(buf+7) ); \
   M8 = mm_bswap_32( *(buf+8) ); \
   M9 = mm_bswap_32( *(buf+9) ); \
   MA = mm_bswap_32( *(buf+10) ); \
   MB = mm_bswap_32( *(buf+11) ); \
   MC = mm_bswap_32( *(buf+12) ); \
   MD = mm_bswap_32( *(buf+13) ); \
   ME = mm_bswap_32( *(buf+14) ); \
   MF = mm_bswap_32( *(buf+15) ); \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
   ROUND_S_4WAY(2); \
@@ -727,22 +727,22 @@ do { \
   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
   M0 = mm256_byteswap_32( * buf ); \
   M1 = mm256_byteswap_32( *(buf+1) ); \
   M2 = mm256_byteswap_32( *(buf+2) ); \
   M3 = mm256_byteswap_32( *(buf+3) ); \
   M4 = mm256_byteswap_32( *(buf+4) ); \
   M5 = mm256_byteswap_32( *(buf+5) ); \
   M6 = mm256_byteswap_32( *(buf+6) ); \
   M7 = mm256_byteswap_32( *(buf+7) ); \
   M8 = mm256_byteswap_32( *(buf+8) ); \
   M9 = mm256_byteswap_32( *(buf+9) ); \
   MA = mm256_byteswap_32( *(buf+10) ); \
   MB = mm256_byteswap_32( *(buf+11) ); \
   MC = mm256_byteswap_32( *(buf+12) ); \
   MD = mm256_byteswap_32( *(buf+13) ); \
   ME = mm256_byteswap_32( *(buf+14) ); \
   MF = mm256_byteswap_32( *(buf+15) ); \
   M0 = mm256_bswap_32( * buf ); \
   M1 = mm256_bswap_32( *(buf+1) ); \
   M2 = mm256_bswap_32( *(buf+2) ); \
   M3 = mm256_bswap_32( *(buf+3) ); \
   M4 = mm256_bswap_32( *(buf+4) ); \
   M5 = mm256_bswap_32( *(buf+5) ); \
   M6 = mm256_bswap_32( *(buf+6) ); \
   M7 = mm256_bswap_32( *(buf+7) ); \
   M8 = mm256_bswap_32( *(buf+8) ); \
   M9 = mm256_bswap_32( *(buf+9) ); \
   MA = mm256_bswap_32( *(buf+10) ); \
   MB = mm256_bswap_32( *(buf+11) ); \
   MC = mm256_bswap_32( *(buf+12) ); \
   MD = mm256_bswap_32( *(buf+13) ); \
   ME = mm256_bswap_32( *(buf+14) ); \
   MF = mm256_bswap_32( *(buf+15) ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -848,22 +848,22 @@ do { \
                          _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
   VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                          _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
   M[0x0] = mm256_byteswap_64( *(buf+0) ); \
   M[0x1] = mm256_byteswap_64( *(buf+1) ); \
   M[0x2] = mm256_byteswap_64( *(buf+2) ); \
   M[0x3] = mm256_byteswap_64( *(buf+3) ); \
   M[0x4] = mm256_byteswap_64( *(buf+4) ); \
   M[0x5] = mm256_byteswap_64( *(buf+5) ); \
   M[0x6] = mm256_byteswap_64( *(buf+6) ); \
   M[0x7] = mm256_byteswap_64( *(buf+7) ); \
   M[0x8] = mm256_byteswap_64( *(buf+8) ); \
   M[0x9] = mm256_byteswap_64( *(buf+9) ); \
   M[0xA] = mm256_byteswap_64( *(buf+10) ); \
   M[0xB] = mm256_byteswap_64( *(buf+11) ); \
   M[0xC] = mm256_byteswap_64( *(buf+12) ); \
   M[0xD] = mm256_byteswap_64( *(buf+13) ); \
   M[0xE] = mm256_byteswap_64( *(buf+14) ); \
   M[0xF] = mm256_byteswap_64( *(buf+15) ); \
   M[0x0] = mm256_bswap_64( *(buf+0) ); \
   M[0x1] = mm256_bswap_64( *(buf+1) ); \
   M[0x2] = mm256_bswap_64( *(buf+2) ); \
   M[0x3] = mm256_bswap_64( *(buf+3) ); \
   M[0x4] = mm256_bswap_64( *(buf+4) ); \
   M[0x5] = mm256_bswap_64( *(buf+5) ); \
   M[0x6] = mm256_bswap_64( *(buf+6) ); \
   M[0x7] = mm256_bswap_64( *(buf+7) ); \
   M[0x8] = mm256_bswap_64( *(buf+8) ); \
   M[0x9] = mm256_bswap_64( *(buf+9) ); \
   M[0xA] = mm256_bswap_64( *(buf+10) ); \
   M[0xB] = mm256_bswap_64( *(buf+11) ); \
   M[0xC] = mm256_bswap_64( *(buf+12) ); \
   M[0xD] = mm256_bswap_64( *(buf+13) ); \
   M[0xE] = mm256_bswap_64( *(buf+14) ); \
   M[0xF] = mm256_bswap_64( *(buf+15) ); \
   for (r = 0; r < 16; r ++) \
      ROUND_B_4WAY(r); \
   H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +913,22 @@ do { \
                          _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
   VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                          _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
   M0 = mm256_byteswap_64( *(buf + 0) ); \
   M1 = mm256_byteswap_64( *(buf + 1) ); \
   M2 = mm256_byteswap_64( *(buf + 2) ); \
   M3 = mm256_byteswap_64( *(buf + 3) ); \
   M4 = mm256_byteswap_64( *(buf + 4) ); \
   M5 = mm256_byteswap_64( *(buf + 5) ); \
   M6 = mm256_byteswap_64( *(buf + 6) ); \
   M7 = mm256_byteswap_64( *(buf + 7) ); \
   M8 = mm256_byteswap_64( *(buf + 8) ); \
   M9 = mm256_byteswap_64( *(buf + 9) ); \
   MA = mm256_byteswap_64( *(buf + 10) ); \
   MB = mm256_byteswap_64( *(buf + 11) ); \
   MC = mm256_byteswap_64( *(buf + 12) ); \
   MD = mm256_byteswap_64( *(buf + 13) ); \
   ME = mm256_byteswap_64( *(buf + 14) ); \
   MF = mm256_byteswap_64( *(buf + 15) ); \
   M0 = mm256_bswap_64( *(buf + 0) ); \
   M1 = mm256_bswap_64( *(buf + 1) ); \
   M2 = mm256_bswap_64( *(buf + 2) ); \
   M3 = mm256_bswap_64( *(buf + 3) ); \
   M4 = mm256_bswap_64( *(buf + 4) ); \
   M5 = mm256_bswap_64( *(buf + 5) ); \
   M6 = mm256_bswap_64( *(buf + 6) ); \
   M7 = mm256_bswap_64( *(buf + 7) ); \
   M8 = mm256_bswap_64( *(buf + 8) ); \
   M9 = mm256_bswap_64( *(buf + 9) ); \
   MA = mm256_bswap_64( *(buf + 10) ); \
   MB = mm256_bswap_64( *(buf + 11) ); \
   MC = mm256_bswap_64( *(buf + 12) ); \
   MD = mm256_bswap_64( *(buf + 13) ); \
   ME = mm256_bswap_64( *(buf + 14) ); \
   MF = mm256_bswap_64( *(buf + 15) ); \
   ROUND_B_4WAY(0); \
   ROUND_B_4WAY(1); \
   ROUND_B_4WAY(2); \
@@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
      if (out_size_w32 == 8)
         u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
                                      _mm_set1_epi32( 0x01000000UL ) );
      *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
      *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
   }
   else
@@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
      memset_zero_128( u.buf, 56>>2 );
      if (out_size_w32 == 8)
         u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
      *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
      *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( sc, u.buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
      out[k] = mm_byteswap_32( sc->H[k] );
      out[k] = mm_bswap_32( sc->H[k] );
}

#if defined (__AVX2__)

@@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
      if (out_size_w32 == 8)
         u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
                                         _mm256_set1_epi32( 0x01000000UL ) );
      *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
      *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
      blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
   }
   else
@@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
      memset_zero_256( u.buf, 56>>2 );
      if (out_size_w32 == 8)
         u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
      *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
      *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
      *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
      blake32_8way( sc, u.buf, 64 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
      out[k] = mm256_byteswap_32( sc->H[k] );
      out[k] = mm256_bswap_32( sc->H[k] );
}

// Blake-512 4 way

@@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc,
      if ( out_size_w64 == 8 )
         u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
                                _mm256_set1_epi64x( 0x0100000000000000ULL ) );
      *(u.buf+(112>>3)) = mm256_byteswap_64(
      *(u.buf+(112>>3)) = mm256_bswap_64(
                                 _mm256_set_epi64x( th, th, th, th ) );
      *(u.buf+(120>>3)) = mm256_byteswap_64(
      *(u.buf+(120>>3)) = mm256_bswap_64(
                                 _mm256_set_epi64x( tl, tl, tl, tl ) );

      blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc,
      memset_zero_256( u.buf, 112>>3 );
      if ( out_size_w64 == 8 )
         u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
      *(u.buf+(112>>3)) = mm256_byteswap_64(
      *(u.buf+(112>>3)) = mm256_bswap_64(
                                 _mm256_set_epi64x( th, th, th, th ) );
      *(u.buf+(120>>3)) = mm256_byteswap_64(
      *(u.buf+(120>>3)) = mm256_bswap_64(
                                 _mm256_set_epi64x( tl, tl, tl, tl ) );

      blake64_4way( sc, u.buf, 128 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w64; k++ )
      out[k] = mm256_byteswap_64( sc->H[k] );
      out[k] = mm256_bswap_64( sc->H[k] );
}

#endif
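The hunks above are a mechanical rename of the SIMD byte-swap helpers (mm_byteswap_32 to mm_bswap_32, plus the 64-bit and 256-bit variants); behaviour is unchanged. For readers unfamiliar with these helpers, here is a minimal sketch of a 4-lane 32-bit byte swap. It is an illustration only; the repository's actual definition may differ.

```c
#include <immintrin.h>

// Hypothetical sketch of mm_bswap_32: reverse the bytes inside each of the
// four 32-bit lanes of an __m128i (e.g. 0x11223344 -> 0x44332211 per lane).
static inline __m128i mm_bswap_32_sketch( __m128i x )
{
#if defined(__SSSE3__)
   // One byte shuffle does it: output byte i takes input byte mask[i].
   return _mm_shuffle_epi8( x,
            _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                           4, 5, 6, 7,  0, 1, 2, 3 ) );
#else
   // Plain SSE2 fallback: swap bytes within each 16-bit word, then swap
   // the 16-bit halves within each 32-bit lane.
   x = _mm_or_si128( _mm_slli_epi16( x, 8 ), _mm_srli_epi16( x, 8 ) );
   return _mm_or_si128( _mm_slli_epi32( x, 16 ), _mm_srli_epi32( x, 16 ) );
#endif
}
```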
@@ -96,7 +96,7 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,

   if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
   {
      *hashes_done = 0;
      sleep(1);
//      sleep(1);
   }

   return num_found;
@@ -12,11 +12,11 @@ static __thread blake256_4way_context blake_mid;

void decred_hash_4way( void *state, const void *input )
{
   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (32)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
   uint32_t hash3[8] __attribute__ ((aligned (32)));
   void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
//   uint32_t hash0[8] __attribute__ ((aligned (32)));
//   uint32_t hash1[8] __attribute__ ((aligned (32)));
//   uint32_t hash2[8] __attribute__ ((aligned (32)));
//   uint32_t hash3[8] __attribute__ ((aligned (32)));
   const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
   int tail_len = 180 - DECRED_MIDSTATE_LEN;
   blake256_4way_context ctx __attribute__ ((aligned (64)));
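decred_hash_4way resumes from a cached Blake-256 midstate (blake_mid): the first DECRED_MIDSTATE_LEN bytes of the 180-byte Decred header are absorbed once, and only the tail is rehashed per nonce. The << 2 is the 4-way interleave at work: in an interleaved buffer every per-lane byte offset scales by the lane count. A sketch of that arithmetic, assuming DECRED_MIDSTATE_LEN counts per-lane bytes:

```c
#include <stdint.h>

#define DECRED_HEADER_LEN 180          // per-lane header size in bytes

// In a 4-way interleaved buffer, per-lane byte offset L lives at byte L*4,
// because the four lanes' words alternate. Hence tail = input + (len << 2).
static const void *decred_tail( const void *interleaved, int midstate_len )
{
   return (const uint8_t*)interleaved + ( midstate_len << 2 );
}

// The remaining length is counted per lane, so no scaling here.
static int decred_tail_len( int midstate_len )
{
   return DECRED_HEADER_LEN - midstate_len;
}
```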
@@ -49,11 +49,6 @@ extern "C"{

// BMW256

// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
// while lanes 1 & 3 produce invalid hash. The cause is not known.


static const sph_u32 IV256[] = {
   SPH_C32(0x40414243), SPH_C32(0x44454647),
   SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
@@ -121,16 +116,14 @@ static const sph_u64 IV512[] = {
   mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
               ( ( (j) + (off) ) & 0xF ) + 1 )

// The multiplication in this macro is a possible cause of the lane
// corruption but a vectorized mullo did not help.
#define add_elt_s( M, H, j ) \
   _mm_xor_si128( \
      _mm_add_epi32( \
         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
                                       rol_off_32( M, j, 3 ) ), \
                        rol_off_32( M, j, 10 ) ), \
         _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \
      ), H[ ( (j)+7 ) & 0xF ] )
      _mm_add_epi32( \
         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
                                       rol_off_32( M, j, 3 ) ), \
                        rol_off_32( M, j, 10 ) ), \
         _mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
      H[ ( (j)+7 ) & 0xF ] )
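Since the macro is dense, here is the same per-element computation in scalar form for a single lane; the names are mine, but the arithmetic mirrors add_elt_s above.

```c
#include <stdint.h>

static inline uint32_t rol32( uint32_t x, int c )
{
   return ( x << c ) | ( x >> ( 32 - c ) );
}

// Scalar equivalent of add_elt_s for one lane: three rotated message words
// are combined, the per-round constant Km is added, and the result is
// XORed with a state word. rol_off_32(M,j,off) rotates M[(j+off) & 0xF]
// left by ((j+off) & 0xF) + 1 bits.
static inline uint32_t add_elt_scalar( const uint32_t M[16],
                                       const uint32_t H[16], int j )
{
   uint32_t a  = rol32( M[  j       & 0xF ], (  j       & 0xF ) + 1 );
   uint32_t b  = rol32( M[ (j + 3)  & 0xF ], ( (j + 3)  & 0xF ) + 1 );
   uint32_t c  = rol32( M[ (j + 10) & 0xF ], ( (j + 10) & 0xF ) + 1 );
   uint32_t Km = (uint32_t)( j + 16 ) * 0x05555555UL;  // the suspect multiply
   return ( a + b - c + Km ) ^ H[ (j + 7) & 0xF ];
}
```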
#define expand1s( qt, M, H, i ) \
@@ -447,22 +440,22 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
   __m128i qt[32], xl, xh; \

   qt[ 0] = ss0( Ws0 ) + H[ 1];
   qt[ 1] = ss1( Ws1 ) + H[ 2];
   qt[ 2] = ss2( Ws2 ) + H[ 3];
   qt[ 3] = ss3( Ws3 ) + H[ 4];
   qt[ 4] = ss4( Ws4 ) + H[ 5];
   qt[ 5] = ss0( Ws5 ) + H[ 6];
   qt[ 6] = ss1( Ws6 ) + H[ 7];
   qt[ 7] = ss2( Ws7 ) + H[ 8];
   qt[ 8] = ss3( Ws8 ) + H[ 9];
   qt[ 9] = ss4( Ws9 ) + H[10];
   qt[10] = ss0( Ws10) + H[11];
   qt[11] = ss1( Ws11) + H[12];
   qt[12] = ss2( Ws12) + H[13];
   qt[13] = ss3( Ws13) + H[14];
   qt[14] = ss4( Ws14) + H[15];
   qt[15] = ss0( Ws15) + H[ 0];
   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
   qt[16] = expand1s( qt, M, H, 16 );
   qt[17] = expand1s( qt, M, H, 17 );
   qt[18] = expand2s( qt, M, H, 18 );
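The rewrite above replaces C-level + on __m128i values with explicit _mm_add_epi32 calls. That is more than style: GCC and Clang define __m128i as a vector of two 64-bit integers, so the vector-extension + performs 64-bit lane additions and lets a carry cross bit 31, while BMW-256 needs four independent 32-bit adds. That failure mode corrupts only the high 32-bit half of each 64-bit lane, which would fit the "lanes 1 & 3 invalid, lanes 0 & 2 valid" symptom noted earlier, though the source does not confirm this diagnosis. A minimal demonstration (my example, GCC/Clang only):

```c
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
   __m128i a = _mm_set1_epi32( (int)0xFFFFFFFF ); // all-ones per 32-bit lane
   __m128i b = _mm_set1_epi32( 1 );

   __m128i ext  = a + b;                 // vector extension: two 64-bit adds
   __m128i lane = _mm_add_epi32( a, b ); // intrinsic: four 32-bit adds

   unsigned e[4], l[4];
   _mm_storeu_si128( (__m128i*)e, ext );
   _mm_storeu_si128( (__m128i*)l, lane );
   // extension: 00000000 00000001 00000000 00000001 (carry crossed bit 31)
   // intrinsic: 00000000 00000000 00000000 00000000 (carries stay in-lane)
   printf( "extension: %08x %08x %08x %08x\n", e[0], e[1], e[2], e[3] );
   printf( "intrinsic: %08x %08x %08x %08x\n", l[0], l[1], l[2], l[3] );
   return 0;
}
```

In the 64-bit compress_big below, + on __m256i happens to mean four 64-bit adds, exactly what _mm256_add_epi64 spells out, which is consistent with only the small variant misbehaving.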
@@ -738,24 +731,24 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )

void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
   __m256i qt[32], xl, xh; \
   __m256i qt[32], xl, xh;

   qt[ 0] = sb0( Wb0 ) + H[ 1];
   qt[ 1] = sb1( Wb1 ) + H[ 2];
   qt[ 2] = sb2( Wb2 ) + H[ 3];
   qt[ 3] = sb3( Wb3 ) + H[ 4];
   qt[ 4] = sb4( Wb4 ) + H[ 5];
   qt[ 5] = sb0( Wb5 ) + H[ 6];
   qt[ 6] = sb1( Wb6 ) + H[ 7];
   qt[ 7] = sb2( Wb7 ) + H[ 8];
   qt[ 8] = sb3( Wb8 ) + H[ 9];
   qt[ 9] = sb4( Wb9 ) + H[10];
   qt[10] = sb0( Wb10) + H[11];
   qt[11] = sb1( Wb11) + H[12];
   qt[12] = sb2( Wb12) + H[13];
   qt[13] = sb3( Wb13) + H[14];
   qt[14] = sb4( Wb14) + H[15];
   qt[15] = sb0( Wb15) + H[ 0];
   qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
   qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
   qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] );
   qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] );
   qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] );
   qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] );
   qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] );
   qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] );
   qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] );
   qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] );
   qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] );
   qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] );
   qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] );
   qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
   qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
   qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
   qt[16] = expand1b( qt, M, H, 16 );
   qt[17] = expand1b( qt, M, H, 17 );
   qt[18] = expand2b( qt, M, H, 18 );
@@ -868,7 +861,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
}

// BMW256
/*

static const uint32_t final_s[16][4] =
{
   { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
@@ -888,7 +881,7 @@ static const uint32_t final_s[16][4] =
   { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
   { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
*/
/*
static const __m128i final_s[16] =
{
   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
@@ -908,7 +901,7 @@ static const __m128i final_s[16] =
   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};

*/
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
@@ -984,7 +977,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   }
   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
   buf[ (buf_size - 4) >> 2 ] = mm_zero;
   buf[ (buf_size - 4) >> 2 ] = m128_zero;
   compress_small( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
algo/bmw/bmw.test (new file, 1251 lines)
File diff suppressed because it is too large.
@@ -477,7 +477,7 @@ do { \
   for (u = 0; u < 16; u ++) \
      sph_enc64le_aligned(data + 8 * u, h2[u]); \
   dh = h1; \
   h = final_b; \
   h = (sph_u64*)final_b; \
   } \
   /* end wrapped for break loop */ \
   out = dst; \
@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
#endif
}  // transform

// Ccubehash context initializing is very expensive.
// Cubehash context initializing is very expensive.
// Cache the intial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
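The cached-context trick described in the comment is the same pattern myriad_hash uses further down: pay the expensive initialization once, then clone the pristine state with memcpy for every hash. A self-contained sketch with illustrative names, not the exact repository API:

```c
#include <string.h>
#include <stddef.h>

typedef struct { unsigned int state[32]; } ctx_t; // stand-in for cubehashParam

static ctx_t ctx_cache;   // pristine initial state, computed once

// Stand-in for the costly init (cubehash derives its IV via many rounds).
static void expensive_init( ctx_t *c )
{
   for ( size_t i = 0; i < 32; i++ )
      c->state[i] = (unsigned int)i * 0x9E3779B9u;
}

void init_ctx_cache( void ) { expensive_init( &ctx_cache ); }

void hash_one( ctx_t *ctx )
{
   // Reinitialize by cloning the cached state: a small memcpy instead of
   // re-deriving the initial value on every call.
   memcpy( ctx, &ctx_cache, sizeof(*ctx) );
   // ... absorb message blocks and finalize with ctx ...
}
```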
@@ -1,2 +0,0 @@
amd64
x86
@@ -14,18 +14,20 @@
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
#if defined(__AES__)

#include <memory.h>
#include "miner.h"
#include "hash_api.h"
#include "vperm.h"

//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif

*/

MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
   unsigned int r, b, i, j;
   __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
//   __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
   __m128i t1, t2, s2, k1;
   __m128i _state[4][4], _state2[4][4], _statebackup[4][4];

@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
   int i, j;

   ctx->k = _mm_xor_si128(ctx->k, ctx->k);
   ctx->k = _mm_setzero_si128();
   ctx->processed_bits = 0;
   ctx->uBufferBytes = 0;

@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen)
   return SUCCESS;
}

#endif
@@ -1 +0,0 @@
Çağdaş Çalık
@@ -1,120 +0,0 @@
/*
 * file        : vperm.h
 * version     : 1.0.208
 * date        : 14.12.2010
 *
 * vperm implementation of AES s-box
 *
 * Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */

#ifndef VPERM_H
#define VPERM_H

#include "algo/sha/sha3_common.h"
#include <tmmintrin.h>

/*
extern const unsigned int _k_s0F[];
extern const unsigned int _k_ipt[];
extern const unsigned int _k_opt[];
extern const unsigned int _k_inv[];
extern const unsigned int _k_sb1[];
extern const unsigned int _k_sb2[];
extern const unsigned int _k_sb3[];
extern const unsigned int _k_sb4[];
extern const unsigned int _k_sb5[];
extern const unsigned int _k_sb7[];
extern const unsigned int _k_sbo[];
extern const unsigned int _k_h63[];
extern const unsigned int _k_hc6[];
extern const unsigned int _k_h5b[];
extern const unsigned int _k_h4e[];
extern const unsigned int _k_h0e[];
extern const unsigned int _k_h15[];
extern const unsigned int _k_aesmix1[];
extern const unsigned int _k_aesmix2[];
extern const unsigned int _k_aesmix3[];
extern const unsigned int _k_aesmix4[];
*/

// input: x, table
// output: x
#define TRANSFORM(x, table, t1, t2)\
   t1 = _mm_andnot_si128(M128(_k_s0F), x);\
   t1 = _mm_srli_epi32(t1, 4);\
   x = _mm_and_si128(x, M128(_k_s0F));\
   t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
   x = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
   x = _mm_xor_si128(x, t1)

#if 0
// compiled erroneously with 32-bit msc compiler
   t2 = _mm_shuffle_epi8(table[0], x);\
   x = _mm_shuffle_epi8(table[1], t1);\
   x = _mm_xor_si128(x, t2)
#endif

// input: x
// output: t2, t3
#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
   t1 = _mm_andnot_si128(M128(_k_s0F), x);\
   t1 = _mm_srli_epi32(t1, 4);\
   x = _mm_and_si128(x, M128(_k_s0F));\
   t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
   x = _mm_xor_si128(x, t1);\
   t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
   t3 = _mm_xor_si128(t3, t2);\
   t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
   t4 = _mm_xor_si128(t4, t2);\
   t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
   t2 = _mm_xor_si128(t2, x);\
   t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
   t3 = _mm_xor_si128(t3, t1);\

// input: x1, x2, table
// output: y
#define VPERM_LOOKUP(x1, x2, table, y, t)\
   t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
   y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
   y = _mm_xor_si128(y, t)

// input: x
// output: x
#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4) \
   TRANSFORM(x, _k_ipt, t1, t2);\
   SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
   VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
   x = _mm_xor_si128(x, M128(_k_h63))

// input: x
// output: x
#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
   SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
   VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
   VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
   s3 = _mm_xor_si128(s1, s2);\
   x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
   x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
   x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
   x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
   x = _mm_xor_si128(x, M128(_k_h5b))

// input: x
// output: x
#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
   TRANSFORM(x, _k_ipt, t1, t2);\
   AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
   TRANSFORM(x, _k_opt, t1, t2)

#endif // VPERM_H
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "myrgr-gate.h"

#include <stdio.h>
#include <stdlib.h>
@@ -10,8 +10,6 @@
#else
#include "aes_ni/hash-groestl.h"
#endif

#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"

typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
#else
   hashState_groestl groestl;
#endif
#ifndef USE_SPH_SHA
   SHA256_CTX sha;
#else
   sph_sha256_context sha;
#endif
   sph_sha256_context sha;
} myrgr_ctx_holder;

myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
#else
   init_groestl (&myrgr_ctx.groestl, 64 );
#endif
#ifndef USE_SPH_SHA
   SHA256_Init( &myrgr_ctx.sha );
#else
   sph_sha256_init( &myrgr_ctx.sha );
#endif
   sph_sha256_init(&myrgr_ctx.sha);
}

void myriadhash( void *output, const void *input )
void myriad_hash(void *output, const void *input)
{
   myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
   uint32_t hash[16] __attribute__ ((aligned (64)));
   myrgr_ctx_holder ctx;
   memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );

   uint32_t _ALIGN(32) hash[16];

#ifdef NO_AES_NI
   sph_groestl512(&ctx.groestl, input, 80);
   sph_groestl512_close(&ctx.groestl, hash);
   sph_groestl512(&ctx.groestl, input, 80);
   sph_groestl512_close(&ctx.groestl, hash);
#else
   update_and_final_groestl( &ctx.groestl, (char*)input,
                             (const char*)input, 640 );
   update_groestl( &ctx.groestl, (char*)input, 640 );
   final_groestl( &ctx.groestl, (char*)hash);
#endif

#ifndef USE_SPH_SHA
   SHA256_Update( &ctx.sha, hash, 64 );
   SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else
   sph_sha256(&ctx.sha, hash, 64);
   sph_sha256_close(&ctx.sha, hash);
#endif
   memcpy(output, hash, 32);
   sph_sha256(&ctx.sha, hash, 64);
   sph_sha256_close(&ctx.sha, hash);

   memcpy(output, hash, 32);
}

int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done)
int scanhash_myriad(int thr_id, struct work *work,
                    uint32_t max_nonce, uint64_t *hashes_done)
{
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

   uint32_t endiandata[20] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) endiandata[20];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;

@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

   do {
      const uint32_t Htarg = ptarget[7];
      uint32_t hash[8] __attribute__ ((aligned (64)));
      uint32_t hash[8];
      be32enc(&endiandata[19], nonce);
      myriadhash(hash, endiandata);
      myriad_hash(hash, endiandata);

      if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
         pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

/*
bool register_myriad_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT;
   init_myrgr_ctx();
   gate->scanhash = (void*)&scanhash_myriad;
   gate->hash = (void*)&myriadhash;
//   gate->hash_alt = (void*)&myriadhash;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};

*/
algo/groestl/myrgr-4way.c (new file, 134 lines)
@@ -0,0 +1,134 @@
#include "myrgr-gate.h"

#if defined(MYRGR_4WAY)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha2-hash-4way.h"

typedef struct {
   hashState_groestl groestl;
   sha256_4way_context sha;
} myrgr_4way_ctx_holder;

myrgr_4way_ctx_holder myrgr_4way_ctx;

void init_myrgr_4way_ctx()
{
   init_groestl (&myrgr_4way_ctx.groestl, 64 );
   sha256_4way_init( &myrgr_4way_ctx.sha );
}

void myriad_4way_hash( void *output, const void *input )
{
   uint32_t hash0[20] __attribute__ ((aligned (64)));
   uint32_t hash1[20] __attribute__ ((aligned (64)));
   uint32_t hash2[20] __attribute__ ((aligned (64)));
   uint32_t hash3[20] __attribute__ ((aligned (64)));
   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
   myrgr_4way_ctx_holder ctx;
   memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );

   mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );

   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
   memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
   memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
   memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );

   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

   sha256_4way( &ctx.sha, vhash, 64 );
   sha256_4way_close( &ctx.sha, vhash );

   mm_deinterleave_4x32( output, output+32, output+64, output+96,
                         vhash, 256 );
}

int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 76;   // 19*4
   uint32_t *noncep1 = vdata + 77;
   uint32_t *noncep2 = vdata + 78;
   uint32_t *noncep3 = vdata + 79;

/*
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

   uint32_t _ALIGN(64) endiandata[20];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
*/
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   swab32_array( edata, pdata, 20 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = pdata[19] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
             && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
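scanhash_myriad_4way patches the four nonces directly into the interleaved buffer: word w of lane k sits at vdata[w*4 + k], so word 19 (the nonce) occupies vdata + 76..79, matching noncep0..noncep3 above. A scalar sketch of the 4x32 interleave that produces this layout (illustrative, mirroring how mm_interleave_4x32 is used here):

```c
#include <stdint.h>

// Interleave four 32-bit streams lane by lane: word w of lane k lands at
// dst[w*4 + k]. With bit_len = 640 each lane contributes 20 words, so the
// per-lane nonce word 19 ends up at dst[19*4 + k] = dst[76 + k].
static void interleave_4x32_scalar( uint32_t *dst,
                                    const uint32_t *s0, const uint32_t *s1,
                                    const uint32_t *s2, const uint32_t *s3,
                                    int bit_len )
{
   const int nwords = bit_len / 32;
   for ( int w = 0; w < nwords; w++ )
   {
      dst[ w*4 + 0 ] = s0[w];
      dst[ w*4 + 1 ] = s1[w];
      dst[ w*4 + 2 ] = s2[w];
      dst[ w*4 + 3 ] = s3[w];
   }
}
```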
algo/groestl/myrgr-gate.c (new file, 18 lines)
@@ -0,0 +1,18 @@
#include "myrgr-gate.h"

bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
   init_myrgr_4way_ctx();
   gate->scanhash = (void*)&scanhash_myriad_4way;
   gate->hash     = (void*)&myriad_4way_hash;
#else
   init_myrgr_ctx();
   gate->scanhash = (void*)&scanhash_myriad;
   gate->hash     = (void*)&myriad_hash;
#endif
   gate->optimizations = AES_OPT | AVX2_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};
algo/groestl/myrgr-gate.h (new file, 30 lines)
@@ -0,0 +1,30 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__) && defined(__AES__)
  #define MYRGR_4WAY
#endif

#if defined(MYRGR_4WAY)

void myriad_4way_hash( void *state, const void *input );

int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

void init_myrgr_4way_ctx();

#endif

void myriad_hash( void *state, const void *input );

int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

void init_myrgr_ctx();

#endif
File diff suppressed because it is too large.
@@ -48,20 +48,20 @@ extern "C"{

#define SPH_SIZE_hamsi512 512

// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
   __m128i h[16];
   __m128i partial[2];
   __m256i h[8];
   __m256i buf[1];
   size_t partial_len;
   sph_u32 count_high, count_low;
} hamsi_4way_big_context;

typedef hamsi_4way_big_context hamsi512_4way_context;

void hamsi512_4way_init(void *cc);

void hamsi512_4way(void *cc, const void *data, size_t len);

void hamsi512_4way_close(void *cc, void *dst);
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );

#ifdef __cplusplus
}
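The hunk retypes the hamsi512_4way entry points from an untyped void *cc to the concrete context, so the compiler now checks what callers pass. A hedged usage sketch under the new signatures (the context layout is copied from the hunk; sph_u32 is assumed to be a 32-bit unsigned type; linking still requires the real implementation):

```c
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

typedef uint32_t sph_u32;           // assumption: 32-bit unsigned

typedef struct {                    // layout as declared in the hunk above
   __m256i h[8];
   __m256i buf[1];
   size_t partial_len;
   sph_u32 count_high, count_low;
} hamsi512_4way_context;

void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );

// Typical init/update/close sequence; data and dst hold 4-way interleaved
// 64-bit lanes, as elsewhere in this codebase.
void hamsi512_4way_hash( void *dst, const void *data, size_t len )
{
   hamsi512_4way_context ctx;
   hamsi512_4way_init( &ctx );
   hamsi512_4way( &ctx, data, len );
   hamsi512_4way_close( &ctx, dst );
}
```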
@@ -1,482 +0,0 @@
/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
/*
 * Helper code for Hamsi (input block expansion). This code is
 * automatically generated and includes precomputed tables for
 * expansion code which handles 2 to 8 bits at a time.
 *
 * This file is included from hamsi.c, and is not meant to be compiled
 * independently.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#ifdef __cplusplus
extern "C"{
#endif

/* Note: this table lists bits within each byte from least
   siginificant to most significant. */
static const sph_u32 T512[64][16] = {
	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
	  SPH_C32(0x9e69af68) },
	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
	  SPH_C32(0x0c26f262) },
	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
	  SPH_C32(0xdc24e61f) },
	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
	  SPH_C32(0x3daac2da) },
	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
	  SPH_C32(0x78cace29) },
	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
	  SPH_C32(0x2dd1f9ab) },
	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
	  SPH_C32(0xbf2c0be2) },
	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
	  SPH_C32(0x32219526) },
	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
	  SPH_C32(0xac8e6c88) },
	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
	  SPH_C32(0x7b1bd6b9) },
	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
	  SPH_C32(0xf746c320) },
	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
	  SPH_C32(0x69505b3a) },
	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
	  SPH_C32(0x8a341574) },
	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
	  SPH_C32(0x450360bf) },
	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
	  SPH_C32(0xf3d45758) },
	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
	  SPH_C32(0x925c44e9) },
	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
	  SPH_C32(0xa123ff9f) },
	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
	  SPH_C32(0x1568ff0f) },
	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
	  SPH_C32(0xc5c1eb3e) },
	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
	  SPH_C32(0x1af21fe1) },
	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
	  SPH_C32(0x857f3c2b) },
	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
	  SPH_C32(0x2ba05a55) },
	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
	  SPH_C32(0xfeabf254) },
	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
	  SPH_C32(0xfe1cdc7f) },
	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
	  SPH_C32(0xb0a51834) },
	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
	  SPH_C32(0xa6b8c28d) },
	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
	  SPH_C32(0x3a4e99d7) },
	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
	  SPH_C32(0xe1844257) },
	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
|
||||
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
|
||||
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
|
||||
SPH_C32(0x2c3b504e) },
|
||||
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
|
||||
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
|
||||
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
|
||||
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
|
||||
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
|
||||
SPH_C32(0x524a0d59) },
|
||||
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
|
||||
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
|
||||
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
|
||||
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
|
||||
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
|
||||
SPH_C32(0x378dd173) },
|
||||
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
|
||||
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
|
||||
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
|
||||
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
|
||||
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
|
||||
SPH_C32(0x8b6c72bd) },
|
||||
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
|
||||
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
|
||||
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
|
||||
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
|
||||
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
|
||||
SPH_C32(0x8e67b7fa) },
|
||||
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
|
||||
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
|
||||
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
|
||||
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
|
||||
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
|
||||
SPH_C32(0x443d3004) },
|
||||
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
|
||||
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
|
||||
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
|
||||
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
|
||||
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
|
||||
SPH_C32(0xf4f6ea7b) },
|
||||
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
|
||||
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
|
||||
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
|
||||
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
|
||||
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
|
||||
SPH_C32(0x979961d0) },
|
||||
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
|
||||
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
|
||||
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
|
||||
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
|
||||
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
|
||||
SPH_C32(0x98aa496e) },
|
||||
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
|
||||
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
|
||||
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
|
||||
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
|
||||
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
|
||||
SPH_C32(0x094e3198) },
|
||||
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
|
||||
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
|
||||
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
|
||||
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
|
||||
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
|
||||
SPH_C32(0xe86cba2e) },
|
||||
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
|
||||
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
|
||||
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
|
||||
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
|
||||
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
|
||||
SPH_C32(0x4b7eec55) },
|
||||
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
|
||||
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
|
||||
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
|
||||
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
|
||||
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
|
||||
SPH_C32(0x1e7536a6) },
|
||||
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
|
||||
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
|
||||
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
|
||||
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
|
||||
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
|
||||
SPH_C32(0x24314f17) },
|
||||
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
|
||||
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
|
||||
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
|
||||
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
|
||||
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
|
||||
SPH_C32(0x9075b1ce) },
|
||||
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
|
||||
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
|
||||
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
|
||||
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
|
||||
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
|
||||
SPH_C32(0x9b6ef888) },
|
||||
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
|
||||
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
|
||||
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
|
||||
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
|
||||
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
|
||||
SPH_C32(0xd8b61463) },
|
||||
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
|
||||
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
|
||||
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
|
||||
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
|
||||
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
|
||||
SPH_C32(0x3ea660f7) },
|
||||
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
|
||||
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
|
||||
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
|
||||
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
|
||||
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
|
||||
SPH_C32(0x7f975691) },
|
||||
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
|
||||
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
|
||||
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
|
||||
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
|
||||
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
|
||||
SPH_C32(0x2c94459e) },
|
||||
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
|
||||
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
|
||||
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
|
||||
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
|
||||
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
|
||||
SPH_C32(0x56a7b19f) },
|
||||
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
|
||||
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
|
||||
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
|
||||
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
|
||||
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
|
||||
SPH_C32(0x81fdf908) },
|
||||
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
|
||||
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
|
||||
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
|
||||
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
|
||||
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
|
||||
SPH_C32(0x5bd61539) },
|
||||
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
|
||||
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
|
||||
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
|
||||
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
|
||||
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
|
||||
SPH_C32(0x15b961e7) },
|
||||
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
|
||||
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
|
||||
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
|
||||
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
|
||||
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
|
||||
SPH_C32(0x2a2c18f0) },
|
||||
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
|
||||
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
|
||||
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
|
||||
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
|
||||
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
|
||||
SPH_C32(0x551e3d6e) },
|
||||
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
|
||||
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
|
||||
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
|
||||
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
|
||||
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
|
||||
SPH_C32(0x33c5244f) },
|
||||
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
|
||||
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
|
||||
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
|
||||
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
|
||||
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
|
||||
SPH_C32(0x8a58e6a4) },
|
||||
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
|
||||
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
|
||||
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
|
||||
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
|
||||
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
|
||||
SPH_C32(0xda878000) },
|
||||
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
|
||||
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
|
||||
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
|
||||
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
|
||||
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
|
||||
SPH_C32(0x3c5dfffe) },
|
||||
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
|
||||
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
|
||||
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
|
||||
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
|
||||
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
|
||||
SPH_C32(0x7b1675d7) },
|
||||
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
|
||||
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
|
||||
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
|
||||
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
|
||||
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
|
||||
SPH_C32(0x2879ebac) },
|
||||
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
|
||||
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
|
||||
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
|
||||
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
|
||||
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
|
||||
SPH_C32(0xbe0a679e) },
|
||||
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
|
||||
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
|
||||
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
|
||||
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
|
||||
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
|
||||
SPH_C32(0x30aebcf7) },
|
||||
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
|
||||
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
|
||||
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
|
||||
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
|
||||
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
|
||||
SPH_C32(0xc7ff60f0) },
|
||||
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
|
||||
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
|
||||
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
|
||||
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
|
||||
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
|
||||
SPH_C32(0xe7e00a94) }
|
||||
};
|
||||
|
||||
#define U_BIG( n ) \
|
||||
do { \
|
||||
__m128i db = buf[n]; \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
|
||||
m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
|
||||
db = _mm_srli_epi32( db, 1 ); \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
const sph_u32 *tp = &T512[0][0]; \
|
||||
m0 = mm_zero; \
|
||||
m1 = mm_zero; \
|
||||
m2 = mm_zero; \
|
||||
m3 = mm_zero; \
|
||||
m4 = mm_zero; \
|
||||
m5 = mm_zero; \
|
||||
m6 = mm_zero; \
|
||||
m7 = mm_zero; \
|
||||
m8 = mm_zero; \
|
||||
m9 = mm_zero; \
|
||||
mA = mm_zero; \
|
||||
mB = mm_zero; \
|
||||
mC = mm_zero; \
|
||||
mD = mm_zero; \
|
||||
mE = mm_zero; \
|
||||
mF = mm_zero; \
|
||||
U_BIG( 0 ); \
|
||||
U_BIG( 1 ); \
|
||||
} while (0)
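
/* For reference: U_BIG/INPUT_BIG are the vectorized form of Hamsi's
   table-driven message expansion. Each of the 64 input bits selects one
   16-word row of T512 and the selected rows are XORed together. A minimal
   single-lane scalar sketch of the same computation (hypothetical helper,
   not part of this commit, assuming T512 holds 64 rows of 16 sph_u32 in
   the LSB-first order U_BIG scans):

   static void input_big_ref( const sph_u32 buf[2], sph_u32 m[16] )
   {
      const sph_u32 *tp = &T512[0][0];
      for ( int k = 0; k < 16; k++ ) m[k] = 0;
      for ( int n = 0; n < 2; n++ )
      {
         sph_u32 db = buf[n];
         for ( int u = 0; u < 32; u++, db >>= 1 )
         {
            sph_u32 dm = (sph_u32)0 - ( db & 1 );  // all-ones when bit set
            for ( int k = 0; k < 16; k++ )
               m[k] ^= dm & *tp++;
         }
      }
   }
*/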

#ifdef __cplusplus
}
#endif
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,

   current = (unsigned)sc->count_low & 127UL;

   sc->buf[ current>>2 ] = mm_one_32;
   sc->buf[ current>>2 ] = m128_one_32;
   current += 4;
   RSTATE;
   if ( current > 116UL )
@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c"

#ifndef NO_AES_NI

@@ -95,10 +95,11 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done )
{
#ifndef NO_AES_NI
  GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif
  return false;
}

bool register_hodl_algo( algo_gate_t* gate )
@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
    for ( int round = 0; round < 3; round++ )
    {
       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
                       vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
                       vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       init_groestl( &ctx_groestl, 64 );
@@ -339,13 +339,13 @@ do { \
    jhSbuffer[53] = 0x00, \
    jhSbuffer[54] = 0x00, \
    jhSbuffer[55] = 0x00; \
    jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
    jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
    jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
    jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
    jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
    jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
    jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
    jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
    jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
    jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
    jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
    jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
    jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
    jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
    jhSbuffer[63] = (64*8) & 0xff; \
    b = true; \
 } \
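
The jh change above widens (64*8) to uint64_t before shifting: right-shifting
a 32-bit int by 56, 48 or 40 bits is undefined behaviour in C. A standalone
illustration of the fixed pattern, with hypothetical names:

   uint64_t len_bits = 64*8;
   unsigned char b56 = (unsigned char)( ( len_bits >> 56 ) & 0xff );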
@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64

@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
      kc->w[i] = _mm256_setzero_si256();

   // Initialization for the "lane complement".
   kc->w[ 1] = mm256_neg1;
   kc->w[ 2] = mm256_neg1;
   kc->w[ 8] = mm256_neg1;
   kc->w[12] = mm256_neg1;
   kc->w[17] = mm256_neg1;
   kc->w[20] = mm256_neg1;
   kc->w[ 1] = m256_neg1;
   kc->w[ 2] = m256_neg1;
   kc->w[ 8] = m256_neg1;
   kc->w[12] = m256_neg1;
   kc->w[17] = m256_neg1;
   kc->w[20] = m256_neg1;
   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
}
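
The "lane complement" trick referenced above stores lanes 1, 2, 8, 12, 17 and
20 of the Keccak state inverted so that most of the NOT operations in the chi
step cancel out; setting those words to m256_neg1 (all ones) is simply the
complemented form of the all-zero initial state.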
@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
      KF_ELT( 5, 6, RC[j + 5]); \
      KF_ELT( 6, 7, RC[j + 6]); \
      KF_ELT( 7, 8, RC[j + 7]); \
*/

//kekDECL_STATE \

   kekDECL_STATE \
*/
#define DECL_KEC

583 algo/luffa/luffa-hash-2way.c Normal file
@@ -0,0 +1,583 @@
/*
 * luffa_for_sse2.c
 * Version 2.0 (Sep 15th 2009)
 *
 * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
 *
 * Hitachi, Ltd. is the owner of this software and hereby grant
 * the U.S. Government and any interested party the right to use
 * this software for the purposes of the SHA-3 evaluation process,
 * notwithstanding that this software is copyrighted.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"

#if defined(__AVX2__)

#include "avxdefs.h"

#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )

#define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
    b = _mm256_xor_si256(b,c1);\

#define MULT2(a0,a1) \
do { \
  register __m256i b = _mm256_xor_si256( a0, \
       _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
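
/* Note: _mm256_slli_si256/_mm256_srli_si256 shift bytes within each 128-bit
   lane independently, which is what makes this 2-way version work: each
   lane holds one complete Luffa state, and MULT2 applies the same feedback
   multiply to both at once. A hypothetical one-lane SSE2 equivalent for
   comparison (MASK128 would be the 128-bit counterpart of MASK):

   #define MULT2_1WAY( a0, a1 ) \
   do { \
     __m128i b = _mm_xor_si128( a0, \
          _mm_shuffle_epi32( _mm_and_si128( a1, MASK128 ), 16 ) ); \
     a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
     a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
   } while(0)
*/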

// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
    MIXWORD(*x,*(x+4),*t,*(t+1));\
    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));

#define SUBCRUMB(a0,a1,a2,a3,t)\
    t  = _mm256_load_si256(&a0);\
    a0 = _mm256_or_si256(a0,a1);\
    a2 = _mm256_xor_si256(a2,a3);\
    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
    a0 = _mm256_xor_si256(a0,a3);\
    a3 = _mm256_and_si256(a3,t);\
    a1 = _mm256_xor_si256(a1,a3);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a0);\
    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
    a2 = _mm256_xor_si256(a2,a1);\
    a1 = _mm256_or_si256(a1,a3);\
    t  = _mm256_xor_si256(t,a1);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a1);\
    a1 = _mm256_xor_si256(a1,a0);\
    a0 = _mm256_load_si256(&t);\

#define MIXWORD(a,b,t1,t2)\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,2);\
    t2 = _mm256_srli_epi32(a,30);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,14);\
    t2 = _mm256_srli_epi32(b,18);\
    b  = _mm256_or_si256(t1,t2);\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,10);\
    t2 = _mm256_srli_epi32(a,22);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,1);\
    t2 = _mm256_srli_epi32(b,31);\
    b  = _mm256_or_si256(t1,t2);

#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
    a1 = _mm256_shuffle_epi32(a1,147);\
    t0 = _mm256_load_si256(&a1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    t0 = _mm256_unpackhi_epi32(t0,a0);\
    t1 = _mm256_shuffle_epi32(t0,78);\
    a0 = _mm256_shuffle_epi32(a1,78);\
    SUBCRUMB(t1,t0,a0,a1,tmp0);\
    t0 = _mm256_unpacklo_epi32(t0,t1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    a0 = _mm256_load_si256(&a1);\
    a0 = _mm256_unpackhi_epi64(a0,t0);\
    a1 = _mm256_unpacklo_epi64(a1,t0);\
    a1 = _mm256_shuffle_epi32(a1,57);\
    MIXWORD(a0,a1,tmp0,tmp1);\
    ADD_CONSTANT(a0,a1,c0,c1);

#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
    s2 = _mm256_load_si256(&r1);\
    q2 = _mm256_load_si256(&p1);\
    r2 = _mm256_shuffle_epi32(r2,216);\
    p2 = _mm256_shuffle_epi32(p2,216);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    s2 = _mm256_unpackhi_epi32(s2,r0);\
    q2 = _mm256_unpackhi_epi32(q2,p0);\
    s0 = _mm256_load_si256(&r2);\
    q0 = _mm256_load_si256(&p2);\
    r2 = _mm256_unpacklo_epi64(r2,r1);\
    p2 = _mm256_unpacklo_epi64(p2,p1);\
    s1 = _mm256_load_si256(&s0);\
    q1 = _mm256_load_si256(&q0);\
    s0 = _mm256_unpackhi_epi64(s0,r1);\
    q0 = _mm256_unpackhi_epi64(q0,p1);\
    r2 = _mm256_shuffle_epi32(r2,225);\
    p2 = _mm256_shuffle_epi32(p2,225);\
    r0 = _mm256_load_si256(&s1);\
    p0 = _mm256_load_si256(&q1);\
    s0 = _mm256_shuffle_epi32(s0,225);\
    q0 = _mm256_shuffle_epi32(q0,225);\
    s1 = _mm256_unpacklo_epi64(s1,s2);\
    q1 = _mm256_unpacklo_epi64(q1,q2);\
    r0 = _mm256_unpackhi_epi64(r0,s2);\
    p0 = _mm256_unpackhi_epi64(p0,q2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s3 = _mm256_load_si256(&r2);\
    q3 = _mm256_load_si256(&p2);\

#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
    s0 = _mm256_load_si256(&r0);\
    q0 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r2);\
    q1 = _mm256_load_si256(&p2);\
    r0 = _mm256_unpackhi_epi32(r0,r1);\
    p0 = _mm256_unpackhi_epi32(p0,p1);\
    r2 = _mm256_unpackhi_epi32(r2,r3);\
    p2 = _mm256_unpackhi_epi32(p2,p3);\
    s0 = _mm256_unpacklo_epi32(s0,r1);\
    q0 = _mm256_unpacklo_epi32(q0,p1);\
    s1 = _mm256_unpacklo_epi32(s1,r3);\
    q1 = _mm256_unpacklo_epi32(q1,p3);\
    r1 = _mm256_load_si256(&r0);\
    p1 = _mm256_load_si256(&p0);\
    r0 = _mm256_unpackhi_epi64(r0,r2);\
    p0 = _mm256_unpackhi_epi64(p0,p2);\
    s0 = _mm256_unpackhi_epi64(s0,s1);\
    q0 = _mm256_unpackhi_epi64(q0,q1);\
    r1 = _mm256_unpacklo_epi64(r1,r2);\
    p1 = _mm256_unpacklo_epi64(p1,p2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r1);\
    q1 = _mm256_load_si256(&p1);\

#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    s1 = _mm256_load_si256(&r3);\
    q1 = _mm256_load_si256(&p3);\
    s3 = _mm256_load_si256(&r3);\
    q3 = _mm256_load_si256(&p3);\
    s1 = _mm256_unpackhi_epi32(s1,r2);\
    q1 = _mm256_unpackhi_epi32(q1,p2);\
    s3 = _mm256_unpacklo_epi32(s3,r2);\
    q3 = _mm256_unpacklo_epi32(q3,p2);\
    s0 = _mm256_load_si256(&s1);\
    q0 = _mm256_load_si256(&q1);\
    s2 = _mm256_load_si256(&s3);\
    q2 = _mm256_load_si256(&q3);\
    r3 = _mm256_load_si256(&r1);\
    p3 = _mm256_load_si256(&p1);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    r3 = _mm256_unpackhi_epi32(r3,r0);\
    p3 = _mm256_unpackhi_epi32(p3,p0);\
    s0 = _mm256_unpackhi_epi64(s0,r3);\
    q0 = _mm256_unpackhi_epi64(q0,p3);\
    s1 = _mm256_unpacklo_epi64(s1,r3);\
    q1 = _mm256_unpacklo_epi64(q1,p3);\
    s2 = _mm256_unpackhi_epi64(s2,r1);\
    q2 = _mm256_unpackhi_epi64(q2,p1);\
    s3 = _mm256_unpacklo_epi64(s3,r1);\
    q3 = _mm256_unpacklo_epi64(q3,p1);

#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);

/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};

/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
    0x00000000,0x00000000,0x00000000,0x5090d577,
    0x00000000,0x00000000,0x00000000,0xac11d7fa,
    0x00000000,0x00000000,0x00000000,0x2d1925ab,
    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
    0x00000000,0x00000000,0x00000000,0xb46496ac,
    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
    0x00000000,0x00000000,0x00000000,0xd1925ab0,
    0x00000000,0x00000000,0x00000000,0x78602649,
    0x00000000,0x00000000,0x00000000,0x29131ab6,
    0x00000000,0x00000000,0x00000000,0x8edae952,
    0x00000000,0x00000000,0x00000000,0x0fc053c3,
    0x00000000,0x00000000,0x00000000,0x3b6ba548,
    0x00000000,0x00000000,0x00000000,0x3f014f0c,
    0x00000000,0x00000000,0x00000000,0xedae9520,
    0x00000000,0x00000000,0x00000000,0xfc053c31
};

__m256i CNS[32];

/***************************************************/
/* Round function */
/* state: hash context */

void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
    __m256i t0, t1;
    __m256i *chainv = state->chainv;
    __m256i msg0, msg1;
    __m256i tmp[2];
    __m256i x[8];

    t0 = chainv[0];
    t1 = chainv[1];

    t0 = _mm256_xor_si256( t0, chainv[2] );
    t1 = _mm256_xor_si256( t1, chainv[3] );
    t0 = _mm256_xor_si256( t0, chainv[4] );
    t1 = _mm256_xor_si256( t1, chainv[5] );
    t0 = _mm256_xor_si256( t0, chainv[6] );
    t1 = _mm256_xor_si256( t1, chainv[7] );
    t0 = _mm256_xor_si256( t0, chainv[8] );
    t1 = _mm256_xor_si256( t1, chainv[9] );

    MULT2( t0, t1 );

    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
    msg1 = _mm256_shuffle_epi32( msg[1], 27 );

    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
    chainv[3] = _mm256_xor_si256( chainv[3], t1 );
    chainv[4] = _mm256_xor_si256( chainv[4], t0 );
    chainv[5] = _mm256_xor_si256( chainv[5], t1 );
    chainv[6] = _mm256_xor_si256( chainv[6], t0 );
    chainv[7] = _mm256_xor_si256( chainv[7], t1 );
    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
    chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    t0 = chainv[0];
    t1 = chainv[1];

    MULT2( chainv[0], chainv[1]);
    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );

    MULT2( chainv[2], chainv[3]);
    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);

    MULT2( chainv[4], chainv[5]);
    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);

    MULT2( chainv[6], chainv[7]);
    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);

    MULT2( chainv[8], chainv[9]);
    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
    chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    t0 = chainv[8];
    t1 = chainv[9];

    MULT2( chainv[8], chainv[9]);
    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );

    MULT2( chainv[6], chainv[7]);
    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );

    MULT2( chainv[4], chainv[5]);
    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );

    MULT2( chainv[2], chainv[3] );
    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );

    MULT2( chainv[0], chainv[1] );
    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );

    MULT2( msg0, msg1);
    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );

    MULT2( msg0, msg1);
    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );

    MULT2( msg0, msg1);
    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );

    MULT2( msg0, msg1);
    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );

    MULT2( msg0, msg1);

    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
                                 _mm256_srli_epi32( chainv[3], 31 ) );
    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
                                 _mm256_srli_epi32( chainv[5], 30 ) );
    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
                                 _mm256_srli_epi32( chainv[7], 29 ) );
    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
                                 _mm256_srli_epi32( chainv[9], 28 ) );

    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
                chainv[1],chainv[3],chainv[5],chainv[7],
                x[4], x[5], x[6], x[7] );

    STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
    STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
    STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
    STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
    STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
    STEP_PART( &x[0], &CNS[10], &tmp[0] );
    STEP_PART( &x[0], &CNS[12], &tmp[0] );
    STEP_PART( &x[0], &CNS[14], &tmp[0] );

    MIXTON1024( x[0], x[1], x[2], x[3],
                chainv[0], chainv[2], chainv[4],chainv[6],
                x[4], x[5], x[6], x[7],
                chainv[1],chainv[3],chainv[5],chainv[7]);

    /* Process last 256-bit block */
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
                tmp[0], tmp[1] );
}
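
/* With hashbitlen 512 Luffa uses five 256-bit chains; here chainv[2k] and
   chainv[2k+1] hold chain k for both lanes. After the message injection
   above, the first four chains are permuted by eight STEP_PART rounds
   (constants CNS[0..15]) and the fifth by eight STEP_PART2 rounds
   (constants CNS[16..31]). */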


/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */

void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
    uint32 hash[8] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t[2];
    __m256i zero[2];
    zero[0] = zero[1] = _mm256_setzero_si256();

    /*---- blank round with m=0 ----*/
    rnd512_2way( state, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];

    t[0] = _mm256_xor_si256( t[0], chainv[2] );
    t[1] = _mm256_xor_si256( t[1], chainv[3] );
    t[0] = _mm256_xor_si256( t[0], chainv[4] );
    t[1] = _mm256_xor_si256( t[1], chainv[5] );
    t[0] = _mm256_xor_si256( t[0], chainv[6] );
    t[1] = _mm256_xor_si256( t[1], chainv[7] );
    t[0] = _mm256_xor_si256( t[0], chainv[8] );
    t[1] = _mm256_xor_si256( t[1], chainv[9] );

    t[0] = _mm256_shuffle_epi32( t[0], 27 );
    t[1] = _mm256_shuffle_epi32( t[1], 27 );

    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
    _mm256_store_si256( (__m256i*)&hash[8], t[1] );

    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

    rnd512_2way( state, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];
    t[0] = _mm256_xor_si256( t[0], chainv[2] );
    t[1] = _mm256_xor_si256( t[1], chainv[3] );
    t[0] = _mm256_xor_si256( t[0], chainv[4] );
    t[1] = _mm256_xor_si256( t[1], chainv[5] );
    t[0] = _mm256_xor_si256( t[0], chainv[6] );
    t[1] = _mm256_xor_si256( t[1], chainv[7] );
    t[0] = _mm256_xor_si256( t[0], chainv[8] );
    t[1] = _mm256_xor_si256( t[1], chainv[9] );

    t[0] = _mm256_shuffle_epi32( t[0], 27 );
    t[1] = _mm256_shuffle_epi32( t[1], 27 );

    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
    _mm256_store_si256( (__m256i*)&hash[8], t[1] );

    casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
    casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}

int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
    int i;
    state->hashbitlen = hashbitlen;

    for ( i=0; i<32; i++ ) CNS[i] =
       _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ],
                         CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ] );

    for ( i=0; i<10; i++ ) state->chainv[i] =
       _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2)    ],
                         IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2)    ] );

    ((__m256i*)state->buffer)[0] = m256_zero;
    ((__m256i*)state->buffer)[1] = m256_zero;

    return 0;
}

// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len )
{
    __m256i *vdata  = (__m256i*)data;
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;
    state->rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    // store in buffer for transform in final for midstate to work
    if ( state->rembytes )
    {
      // remaining data bytes
      buffer[0] = mm256_bswap_32( vdata[0] );
      buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                   0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
    }
    return 0;
}

int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];

    // transform pad block
    if ( state->rembytes )
      // not empty, data is in buffer
      rnd512_2way( state, buffer );
    else
    {   // empty pad block, constant data
        msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                  0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
        msg[1] = m256_zero;
        rnd512_2way( state, msg );
    }
    finalization512_2way( state, (uint32*)hashval );

    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( hashval+32 ) );
    return 0;
}

int luffa_2way_update_close( luffa_2way_context *state,
                 void *output, const void *data, size_t inlen )
{
    // Optimized for integrals of 16 bytes, good for 64 and 80 byte len
    const __m256i *vdata = (__m256i*)data;
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes )
    {
       // padding of partial block
       msg[0] = mm256_bswap_32( vdata[0] );
       msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
       rnd512_2way( state, msg );
    }
    else
    {
       // empty pad block
       msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }

    finalization512_2way( state, (uint32*)output );
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( output+32 ) );

    return 0;
}
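
/* Hypothetical usage sketch (not part of this commit): hashing one 80-byte
   block header for two lanes at once, assuming vdata is already laid out
   in the interleaved 2x128-bit order the vectorized miners use:

      luffa_2way_context ctx;
      uint32 hash[16*2] __attribute__ ((aligned (32)));
      luffa_2way_init( &ctx, 512 );
      luffa_2way_update_close( &ctx, hash, vdata, 80 );
*/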

#endif
69 algo/luffa/luffa-hash-2way.h Normal file
@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
 * luffa_for_sse2.h
 * Version 2.0 (Sep 15th 2009)
 *
 * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
 *
 * Hitachi, Ltd. is the owner of this software and hereby grant
 * the U.S. Government and any interested party the right to use
 * this software for the purposes of the SHA-3 evaluation process,
 * notwithstanding that this software is copyrighted.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#if defined(__AVX2__)

#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"

/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512

/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
                                                     * of a message block*/

/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5

/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/

typedef struct {
    uint32 buffer[8*2] __attribute((aligned(64)));
    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
    int hashbitlen;
    int rembytes;
} luffa_2way_context;

int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len );
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
                             const void *data, size_t inlen );

#endif
#endif
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes )
    {
      // remaining data bytes
      casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
      casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
                   _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
       // padding of partial block
       rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
                      mm_byteswap_32( cast_m128i( data ) ) );
                      mm_bswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

    casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

    casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
}

#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

    casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
    casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

    casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
    casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
}
#endif

165 algo/lyra2/allium-4way.c Normal file
@@ -0,0 +1,165 @@
#include "allium-gate.h"
#include <memory.h>
#include <mm_malloc.h>

#if defined (ALLIUM_4WAY)

#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"

typedef struct {
   blake256_4way_context     blake;
   keccak256_4way_context    keccak;
   cubehashParam             cube;
   skein256_4way_context     skein;
   hashState_groestl256      groestl;

} allium_4way_ctx_holder;

static __thread allium_4way_ctx_holder allium_4way_ctx;

bool init_allium_4way_ctx()
{
   keccak256_4way_init( &allium_4way_ctx.keccak );
   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &allium_4way_ctx.skein );
   init_groestl256( &allium_4way_ctx.groestl, 32 );
   return true;
}

void allium_4way_hash( void *state, const void *input )
{
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
   uint32_t hash3[8] __attribute__ ((aligned (32)));
   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));

   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash32 );

   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );

   memcpy( state,    hash0, 32 );
   memcpy( state+32, hash1, 32 );
   memcpy( state+64, hash2, 32 );
   memcpy( state+96, hash3, 32 );
}
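
/* Per 32-byte lane the allium chain above is:
   blake256 -> keccak256 -> LYRA2RE -> cubehash256 -> LYRA2RE -> skein256
   -> groestl256. Blake, keccak and skein run 4-way vectorized; lyra2,
   cubehash and groestl are processed lane by lane. */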
|
||||
|
||||
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 76; // 19*4
|
||||
uint32_t *noncep1 = vdata + 77;
|
||||
uint32_t *noncep2 = vdata + 78;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
blake256_4way_init( &allium_4way_ctx.blake );
|
||||
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
allium_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
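The 4-way scanhash above patches each lane's nonce directly into the interleaved
vdata instead of rebuilding four headers. A minimal illustration of the 4x32
layout this relies on (the helper name lane_word_4x32 is hypothetical, not part
of the patch): word w of lane l lives at v[ w*4 + l ], so nonce word 19 lands at
v+76..v+79, matching noncep0..noncep3.

static inline uint32_t *lane_word_4x32( uint32_t *v, int word, int lane )
{
   return v + word*4 + lane;   // 4x32 interleave: lane index rotates fastest
}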
22
algo/lyra2/allium-gate.c
Normal file
@@ -0,0 +1,22 @@
#include "allium-gate.h"

int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }

bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
  gate->scanhash          = (void*)&scanhash_allium_4way;
  gate->hash              = (void*)&allium_4way_hash;
#else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash          = (void*)&scanhash_allium;
  gate->hash              = (void*)&allium_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->set_target    = (void*)&alt_set_target;
  gate->get_max64     = (void*)&get_max64_0xFFFFLL;
  return true;
};
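For context, register_allium_algo follows the repo's algo-gate pattern: it only
fills in function pointers, and the core miner later drives the algorithm
through them. A sketch of how a miner thread might consume the gate (the driver
shape and the names thr_id, work and max_nonce are assumptions, not the real
loop):

algo_gate_t gate;
register_allium_algo( &gate );
typedef int (*scanhash_fn)( int, struct work*, uint32_t, uint64_t* );
uint64_t hashes_done = 0;
int hits = ((scanhash_fn)gate.scanhash)( thr_id, work, max_nonce, &hashes_done );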
29
algo/lyra2/allium-gate.h
Normal file
@@ -0,0 +1,29 @@
#ifndef ALLIUM_GATE_H__
#define ALLIUM_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"

#if defined(__AVX2__) && defined(__AES__)
  #define ALLIUM_4WAY
#endif

bool register_allium_algo( algo_gate_t* gate );

#if defined(ALLIUM_4WAY)

void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );
bool init_allium_4way_ctx();

#endif

void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );
bool init_allium_ctx();

#endif
112
algo/lyra2/allium.c
Normal file
@@ -0,0 +1,112 @@
#include "allium-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl256.h"
#else
  #include "algo/groestl/sph_groestl.h"
#endif
#include "lyra2.h"

typedef struct {
   sph_blake256_context blake;
   sph_keccak256_context keccak;
   cubehashParam cube;
   sph_skein256_context skein;
#if defined (__AES__)
   hashState_groestl256 groestl;
#else
   sph_groestl256_context groestl;
#endif
} allium_ctx_holder;

static __thread allium_ctx_holder allium_ctx;

bool init_allium_ctx()
{
   sph_keccak256_init( &allium_ctx.keccak );
   cubehashInit( &allium_ctx.cube, 256, 16, 32 );
   sph_skein256_init( &allium_ctx.skein );
#if defined (__AES__)
   init_groestl256( &allium_ctx.groestl, 32 );
#else
   sph_groestl256_init( &allium_ctx.groestl );
#endif
   return true;
}

void allium_hash(void *state, const void *input)
{
   uint32_t hash[8] __attribute__ ((aligned (64)));
   allium_ctx_holder ctx __attribute__ ((aligned (32)));

   memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
   sph_blake256( &ctx.blake, input + 64, 16 );
   sph_blake256_close( &ctx.blake, hash );

   sph_keccak256( &ctx.keccak, hash, 32 );
   sph_keccak256_close( &ctx.keccak, hash );

   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );

   cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );

   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );

   sph_skein256( &ctx.skein, hash, 32 );
   sph_skein256_close( &ctx.skein, hash );

#if defined (__AES__)
   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
#else
   sph_groestl256( &ctx.groestl, hash, 32 );
   sph_groestl256_close( &ctx.groestl, hash );
#endif

   memcpy(state, hash, 32);
}

int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done )
{
   uint32_t _ALIGN(128) hash[8];
   uint32_t _ALIGN(128) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;

   if ( opt_benchmark )
      ptarget[7] = 0x3ffff;

   for ( int i = 0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   sph_blake256_init( &allium_ctx.blake );
   sph_blake256( &allium_ctx.blake, endiandata, 64 );

   do {
      be32enc( &endiandata[19], nonce );
      allium_hash( hash, endiandata );

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
         work_set_target_ratio( work, hash );
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce;
         return 1;
      }
      nonce++;
   } while ( nonce < max_nonce && !work_restart[thr_id].restart );

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}
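Note the midstate trick in scanhash_allium: the Blake-256 pass over the
nonce-free first 64 bytes of the 80-byte header is absorbed once into the
thread-local context before the loop, and allium_hash then feeds only the final
16 bytes per nonce. The idiom in miniature (a sketch using the same sph API as
the file above):

sph_blake256_context mid;
sph_blake256_init( &mid );
sph_blake256( &mid, endiandata, 64 );         // constant prefix, hashed once
for ( uint32_t n = first_nonce; n < max_nonce; n++ )
{
   sph_blake256_context c = mid;              // resume from the midstate
   uint32_t h[8];
   be32enc( &endiandata[19], n );
   sph_blake256( &c, &endiandata[16], 16 );   // last 4 words incl. nonce
   sph_blake256_close( &c, h );
}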
@@ -47,8 +47,9 @@
 */

int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
               const uint64_t timeCost, const uint64_t nRows,
               const uint64_t nCols )
{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -73,6 +74,8 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
                 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );

   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}

int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols )
            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
            const uint64_t timeCost, const uint64_t nRows,
            const uint64_t nCols )
{
   //========================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -230,6 +234,8 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );

   //==== Getting the password + salt + basil padded with 10*1 ============//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}

// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
             uint64_t pwdlen, const void *salt, uint64_t saltlen,
             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
             const uint64_t nRows, const uint64_t nCols )
{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -377,15 +383,15 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
/*
#if defined (__AVX2__)
   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );

#if defined(__AVX2__)
   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__AVX__)
   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
   memset(wholeMatrix, 0, i);
   memset( wholeMatrix, 0, i );
#endif
*/

   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
@@ -406,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;

   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                       - (saltlen + pwdlen) );
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
//                     - (saltlen + pwdlen) );

   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
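The explicit zero-fills are commented out in this commit, presumably because
the setup phase writes every block of the matrix before it is read. LYRA2RE
still allocates the matrix per call with _mm_malloc for 64-byte alignment; a
sketch of the required pairing (row_len_bytes and nRows stand in for the real
size computation):

uint64_t *m = (uint64_t*)_mm_malloc( row_len_bytes * nRows, 64 );
if ( m == NULL ) return -1;
/* ... build and consume the matrix ... */
_mm_free( m );   // _mm_malloc memory must go back through _mm_free, not free()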
@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2( void *K, int64_t kLen, const void *pwd, int32_t pwdlen,
           const void *salt, int32_t saltlen, int64_t timeCost,
           const int16_t nRows, const int16_t nCols );

#endif /* LYRA2_H_ */
@@ -7,9 +7,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"

#include "algo/cubehash/sph_cubehash.h"
//#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"

typedef struct {
@@ -18,19 +15,17 @@ typedef struct {
   cubehashParam cube;
   skein256_4way_context skein;
   bmw256_4way_context bmw;
// sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;

static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

void init_lyra2rev2_4way_ctx()
bool init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
   keccak256_4way_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &l2v2_4way_ctx.skein );
   bmw256_4way_init( &l2v2_4way_ctx.bmw );
// sph_bmw256_init( &l2v2_4way_ctx.bmw );
   return true;
}

void lyra2rev2_4way_hash( void *state, const void *input )
@@ -45,7 +40,6 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

   blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );

   mm256_reinterleave_4x64( vhash64, vhash, 256 );
@@ -54,11 +48,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -71,36 +65,20 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   skein256_4way_close( &ctx.skein, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

// BMW256 4way has a lane corruption problem, only lanes 0 & 2 produce
// good hash. As a result this ugly workaround of running bmw256-4way
// twice with data shuffled to get all 4 lanes of good hash.
// The hash is then shuffled back into the appropriate lanes for output.
// Not as fast but still faster than using sph serially.

// shift lane 1 data to lane 2.
   mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );
   uint32_t trash[8] __attribute__ ((aligned (32)));
// extract lane 0 as usual and lane 2 containing lane 1 hash
   mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
// shift lane 2 data to lane 0 and lane 3 data to lane 2
   mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
   bmw256_4way_init( &ctx.bmw );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );
// extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
   mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );

   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
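The removed workaround is worth spelling out: with only lanes 0 and 2 of
bmw256-4way producing good hash, each input was duplicated into an adjacent
lane pair so every result could be read back from a good lane, at the cost of
two passes. Lane mapping of the old code (illustration only):

// pass 1: lanes (0,1,2,3) = (h0,h0,h1,h1) -> lanes 0,2 yield BMW(h0), BMW(h1)
// pass 2: lanes (0,1,2,3) = (h2,h2,h3,h3) -> lanes 0,2 yield BMW(h2), BMW(h3)
// The replacement runs one straight pass on (h0,h1,h2,h3) and deinterleaves
// all four lanes directly, which implies the lane bug has been fixed.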
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -144,7 +122,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,

   if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
   {
//printf("found0\n");
      found[0] = true;
      num_found++;
      nonces[0] = pdata[19] = n;
@@ -152,7 +129,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   }
   if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
   {
//printf("found1\n");
      found[1] = true;
      num_found++;
      nonces[1] = n+1;
@@ -160,7 +136,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   }
   if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
   {
//printf("found2\n");
      found[2] = true;
      num_found++;
      nonces[2] = n+2;
@@ -168,7 +143,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   }
   if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
   {
//printf("found3\n");
      found[3] = true;
      num_found++;
      nonces[3] = n+3;
@@ -14,18 +14,20 @@ bool lyra2rev2_thread_init()

   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v2_wholeMatrix = _mm_malloc( i, 64 );

#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();
#else
   init_lyra2rev2_ctx();
#endif
   return l2v2_wholeMatrix;
}

bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();
   gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
   gate->hash = (void*)&lyra2rev2_4way_hash;
#else
   init_lyra2rev2_ctx();
   gate->scanhash = (void*)&scanhash_lyra2rev2;
   gate->hash = (void*)&lyra2rev2_hash;
#endif
@@ -20,7 +20,7 @@ void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done );

void init_lyra2rev2_4way_ctx();
bool init_lyra2rev2_4way_ctx();

#endif

@@ -29,7 +29,7 @@ void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );

void init_lyra2rev2_ctx();
bool init_lyra2rev2_ctx();

#endif
@@ -21,7 +21,7 @@ typedef struct {
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;

void init_lyra2rev2_ctx()
bool init_lyra2rev2_ctx()
{
   cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 );
   cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 );
@@ -29,6 +29,7 @@ void init_lyra2rev2_ctx()
   sph_keccak256_init( &lyra2v2_ctx.keccak );
   sph_skein256_init( &lyra2v2_ctx.skein );
   sph_bmw256_init( &lyra2v2_ctx.bmw );
   return true;
}

void l2v2_blake256_midstate( const void* input )
@@ -42,7 +42,7 @@ inline void initState( uint64_t State[/*16*/] )
{
#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i *state = (__m256i*)State;

   state[0] = _mm256_setzero_si256();
   state[1] = _mm256_setzero_si256();
@@ -53,7 +53,7 @@ inline void initState( uint64_t State[/*16*/] )

#elif defined (__AVX__)

   __m128i* state = (__m128i*)State;
   __m128i *state = (__m128i*)State;

   state[0] = _mm_setzero_si128();
   state[1] = _mm_setzero_si128();
@@ -123,8 +123,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

   const int len_m256i = len / 32;
   const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
   __m256i* state = (__m256i*)State;
   __m256i* out = (__m256i*)Out;
   __m256i *state = (__m256i*)State;
   __m256i *out = (__m256i*)Out;
   int i;

   //Squeezes full blocks
@@ -141,8 +141,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

   const int len_m128i = len / 16;
   const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
   __m128i* state = (__m128i*)State;
   __m128i* out = (__m128i*)Out;
   __m128i *state = (__m128i*)State;
   __m128i *out = (__m128i*)Out;
   int i;

   //Squeezes full blocks
@@ -186,19 +186,27 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
{
#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i* in = (__m256i*)In;
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   const __m256i *in = (const __m256i*)In;

   state[0] = _mm256_xor_si256( state[0], in[0] );
   state[1] = _mm256_xor_si256( state[1], in[1] );
   state[2] = _mm256_xor_si256( state[2], in[2] );
   state0 = _mm256_xor_si256( state0, in[0] );
   state1 = _mm256_xor_si256( state1, in[1] );
   state2 = _mm256_xor_si256( state2, in[2] );

   LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
   LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );

   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined (__AVX__)

   __m128i* state = (__m128i*)State;
   __m128i* in = (__m128i*)In;
   __m128i *state = (__m128i*)State;
   const __m128i *in = (const __m128i*)In;

   state[0] = _mm_xor_si128( state[0], in[0] );
   state[1] = _mm_xor_si128( state[1], in[1] );
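The pattern introduced throughout this hunk caches the sponge state in __m256i
variables so the twelve rounds run register-to-register, with one load and one
store per state word instead of repeated memory operands. In miniature (a
sketch with plain casts; casto_m256i appears to index State in __m256i units):

#include <immintrin.h>
__m256i s0 = _mm256_load_si256( (__m256i*)State + 0 );
__m256i s1 = _mm256_load_si256( (__m256i*)State + 1 );
/* ... all rounds operate on s0, s1, ... in registers ... */
_mm256_store_si256( (__m256i*)State + 0, s0 );
_mm256_store_si256( (__m256i*)State + 1, s1 );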
@@ -245,18 +253,26 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
   //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i* in = (__m256i*)In;
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   const __m256i *in = (const __m256i*)In;

   state[0] = _mm256_xor_si256( state[0], in[0] );
   state[1] = _mm256_xor_si256( state[1], in[1] );
   state0 = _mm256_xor_si256( state0, in[0] );
   state1 = _mm256_xor_si256( state1, in[1] );

   LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
   LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );

   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined (__AVX__)

   __m128i* state = (__m128i*)State;
   __m128i* in = (__m128i*)In;
   __m128i *state = (__m128i*)State;
   const __m128i *in = (const __m128i*)In;

   state[0] = _mm_xor_si128( state[0], in[0] );
   state[1] = _mm_xor_si128( state[1], in[1] );
@@ -292,7 +308,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
 * @param state The current state of the sponge
 * @param rowOut Row to receive the data squeezed
 */
inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
                                uint64_t nCols )
{
   int i;
@@ -301,24 +317,19 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,

#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i state0 = _mm256_load_si256( state );
   __m256i state1 = _mm256_load_si256( &state[1] );
   __m256i state2 = _mm256_load_si256( &state[2] );
   __m256i state3 = _mm256_load_si256( &state[3] );
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

   __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

   for ( i = 0; i < 9; i += 3)
   {
      _mm_prefetch( out - i, _MM_HINT_T0 );
      _mm_prefetch( out - i - 2, _MM_HINT_T0 );
   }
   __builtin_prefetch( out, 1, 0 );
   __builtin_prefetch( out -2, 1, 0 );
   __builtin_prefetch( out -4, 1, 0 );
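The diff trades _mm_prefetch for GCC's __builtin_prefetch( addr, rw, locality ),
which also encodes access intent: rw is 0 for a read and 1 for a write, and
locality runs from 0 (no temporal reuse, as used here for streamed rows) up to
3 (keep in all cache levels). For example:

__builtin_prefetch( out, 1, 0 );   // will be written once, then evicted
__builtin_prefetch( in,  0, 3 );   // will be read again soon, keep it cached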
   for ( i = 0; i < nCols; i++ )
   {
      _mm_prefetch( out - 9, _MM_HINT_T0 );
      _mm_prefetch( out - 11, _MM_HINT_T0 );
      __builtin_prefetch( out -i-6, 1, 0 );

      out[0] = state0;
      out[1] = state1;
@@ -330,15 +341,14 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
      LYRA_ROUND_AVX2( state0, state1, state2, state3 );
   }

   _mm256_store_si256( state, state0 );
   _mm256_store_si256( &state[1], state1 );
   _mm256_store_si256( &state[2], state2 );
   _mm256_store_si256( &state[3], state3 );

   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined (__AVX__)

   __m128i* state = (__m128i*)State;
   __m128i *state = (__m128i*)State;
   __m128i state0 = _mm_load_si128( state );
   __m128i state1 = _mm_load_si128( &state[1] );
   __m128i state2 = _mm_load_si128( &state[2] );
@@ -348,7 +358,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
   __m128i state6 = _mm_load_si128( &state[6] );
   __m128i state7 = _mm_load_si128( &state[7] );

   __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
   __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

   for ( i = 0; i < 6; i += 3)
   {
@@ -387,7 +397,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,

#else

   uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
   uint64_t *ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]

   for ( i = 0; i < nCols; i++ )
   {
@@ -422,37 +432,31 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
 * @param rowIn Row to feed the sponge
 * @param rowOut Row to receive the sponge's output
 */
inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
                               uint64_t *rowOut, uint64_t nCols )
{
   int i;

#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i state0 = _mm256_load_si256( state );
   __m256i state1 = _mm256_load_si256( &state[1] );
   __m256i state2 = _mm256_load_si256( &state[2] );
   __m256i state3 = _mm256_load_si256( &state[3] );
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   const __m256i *in = (const __m256i*)rowIn;
   __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

   __m256i* in = (__m256i*)rowIn;
   __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

   for ( i = 0; i < 9; i += 3)
   {
      _mm_prefetch( in + i, _MM_HINT_T0 );
      _mm_prefetch( in + i + 2, _MM_HINT_T0 );
      _mm_prefetch( out - i, _MM_HINT_T0 );
      _mm_prefetch( out - i - 2, _MM_HINT_T0 );
   }
   __builtin_prefetch( in, 0, 0 );
   __builtin_prefetch( in +2, 0, 0 );
   __builtin_prefetch( in +4, 0, 0 );
   __builtin_prefetch( out, 1, 0 );
   __builtin_prefetch( out -2, 1, 0 );
   __builtin_prefetch( out -4, 1, 0 );

   for ( i = 0; i < nCols; i++ )
   {

      _mm_prefetch( in + 9, _MM_HINT_T0 );
      _mm_prefetch( in + 11, _MM_HINT_T0 );
      _mm_prefetch( out - 9, _MM_HINT_T0 );
      _mm_prefetch( out - 11, _MM_HINT_T0 );
      __builtin_prefetch( in +i+6, 0, 0 );
      __builtin_prefetch( out -i-6, 1, 0 );

      state0 = _mm256_xor_si256( state0, in[0] );
      state1 = _mm256_xor_si256( state1, in[1] );
@@ -470,14 +474,14 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
      out -= BLOCK_LEN_M256I;
   }

   _mm256_store_si256( state, state0 );
   _mm256_store_si256( &state[1], state1 );
   _mm256_store_si256( &state[2], state2 );
   _mm256_store_si256( &state[3], state3 );
   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined (__AVX__)

   __m128i* state = (__m128i*)State;
   __m128i *state = (__m128i*)State;
   __m128i state0 = _mm_load_si128( state );
   __m128i state1 = _mm_load_si128( &state[1] );
   __m128i state2 = _mm_load_si128( &state[2] );
@@ -487,8 +491,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
   __m128i state6 = _mm_load_si128( &state[6] );
   __m128i state7 = _mm_load_si128( &state[7] );

   __m128i* in = (__m128i*)rowIn;
   __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
   const __m128i *in = (const __m128i*)rowIn;
   __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

   for ( i = 0; i < 6; i += 3)
   {
@@ -540,8 +544,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,

#else

   uint64_t* ptrWordIn = rowIn;   //In Lyra2: pointer to prev
   uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
   const uint64_t *ptrWordIn = (const uint64_t*)rowIn;   //In Lyra2: pointer to prev
   uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

   for ( i = 0; i < nCols; i++ )
   {
@@ -600,7 +604,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
 * @param rowOut Row receiving the output
 *
 */
inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
{
@@ -608,35 +612,30 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,

#if defined (__AVX2__)

   __m256i* state = (__m256i*)State;
   __m256i state0 = _mm256_load_si256( state );
   __m256i state1 = _mm256_load_si256( &state[1] );
   __m256i state2 = _mm256_load_si256( &state[2] );
   __m256i state3 = _mm256_load_si256( &state[3] );
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   const __m256i *in = (const __m256i*)rowIn;
   __m256i *inout = (__m256i*)rowInOut;
   __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
   __m256i t0, t1, t2;

   __m256i* in = (__m256i*)rowIn;
   __m256i* inout = (__m256i*)rowInOut;
   __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
   __m256i t0, t1, t2;

   for ( i = 0; i < 9; i += 3)
   {
      _mm_prefetch( in + i, _MM_HINT_T0 );
      _mm_prefetch( in + i + 2, _MM_HINT_T0 );
      _mm_prefetch( inout + i, _MM_HINT_T0 );
      _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
      _mm_prefetch( out - i, _MM_HINT_T0 );
      _mm_prefetch( out - i - 2, _MM_HINT_T0 );
   }
   __builtin_prefetch( in, 0, 0 );
   __builtin_prefetch( in +2, 0, 0 );
   __builtin_prefetch( in +4, 0, 0 );
   __builtin_prefetch( inout, 1, 0 );
   __builtin_prefetch( inout +2, 1, 0 );
   __builtin_prefetch( inout +4, 1, 0 );
   __builtin_prefetch( out, 1, 0 );
   __builtin_prefetch( out -2, 1, 0 );
   __builtin_prefetch( out -4, 1, 0 );

   for ( i = 0; i < nCols; i++ )
   {
      _mm_prefetch( in + 9, _MM_HINT_T0 );
      _mm_prefetch( in + 11, _MM_HINT_T0 );
      _mm_prefetch( inout + 9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );
      _mm_prefetch( out - 9, _MM_HINT_T0 );
      _mm_prefetch( out - 11, _MM_HINT_T0 );
      __builtin_prefetch( in +i+6, 0, 0 );
      __builtin_prefetch( inout +i+6, 1, 0 );
      __builtin_prefetch( out -i-6, 1, 0 );

      state0 = _mm256_xor_si256( state0,
                                 _mm256_add_epi64( in[0], inout[0] ) );
@@ -670,16 +669,16 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
      out -= BLOCK_LEN_M256I;
   }

   _mm256_store_si256( state, state0 );
   _mm256_store_si256( &state[1], state1 );
   _mm256_store_si256( &state[2], state2 );
   _mm256_store_si256( &state[3], state3 );
   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined (__AVX__)

   __m128i* in = (__m128i*)rowIn;
   __m128i* inout = (__m128i*)rowInOut;
   __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
   const __m128i *in = (const __m128i*)rowIn;
   __m128i *inout = (__m128i*)rowInOut;
   __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

   for ( i = 0; i < 6; i += 3)
   {
@@ -691,12 +690,12 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
      _mm_prefetch( out - i - 2, _MM_HINT_T0 );
   }

   __m128i* state = (__m128i*)State;
   __m128i *state = (__m128i*)State;

   // For the last round in this function not optimized for AVX
   uint64_t* ptrWordIn = rowIn;       //In Lyra2: pointer to prev
   uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
   const uint64_t *ptrWordIn = rowIn; //In Lyra2: pointer to prev
   uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

   for ( i = 0; i < nCols; i++ )
   {
@@ -757,9 +756,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,

#else

   uint64_t* ptrWordIn = rowIn;       //In Lyra2: pointer to prev
   uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
   const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
   uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

   for ( i = 0; i < nCols; i++ )
   {
@@ -834,7 +833,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
 *
 */

inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
{
@@ -842,35 +841,30 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,

#if defined __AVX2__

   __m256i* state = (__m256i*)State;
   __m256i state0 = _mm256_load_si256( state );
   __m256i state1 = _mm256_load_si256( &state[1] );
   __m256i state2 = _mm256_load_si256( &state[2] );
   __m256i state3 = _mm256_load_si256( &state[3] );
   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
   const __m256i* in = (const __m256i*)rowIn;
   __m256i *inout = (__m256i*)rowInOut;
   __m256i *out = (__m256i*)rowOut;
   __m256i t0, t1, t2;

   __m256i* in = (__m256i*)rowIn;
   __m256i* inout = (__m256i*)rowInOut;
   __m256i* out = (__m256i*)rowOut;
   __m256i t0, t1, t2;

   for ( i = 0; i < 9; i += 3)
   {
      _mm_prefetch( in + i, _MM_HINT_T0 );
      _mm_prefetch( in + i + 2, _MM_HINT_T0 );
      _mm_prefetch( out + i, _MM_HINT_T0 );
      _mm_prefetch( out + i + 2, _MM_HINT_T0 );
      _mm_prefetch( inout + i, _MM_HINT_T0 );
      _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
   }
   __builtin_prefetch( in, 0, 0 );
   __builtin_prefetch( in +2, 0, 0 );
   __builtin_prefetch( in +4, 0, 0 );
   __builtin_prefetch( inout, 1, 0 );
   __builtin_prefetch( inout +2, 1, 0 );
   __builtin_prefetch( inout +4, 1, 0 );
   __builtin_prefetch( out, 1, 0 );
   __builtin_prefetch( out +2, 1, 0 );
   __builtin_prefetch( out +4, 1, 0 );

   for ( i = 0; i < nCols; i++ )
   {
      _mm_prefetch( in + 9, _MM_HINT_T0 );
      _mm_prefetch( in + 11, _MM_HINT_T0 );
      _mm_prefetch( out + 9, _MM_HINT_T0 );
      _mm_prefetch( out + 11, _MM_HINT_T0 );
      _mm_prefetch( inout + 9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );
      __builtin_prefetch( in +i+6, 0, 0 );
      __builtin_prefetch( inout +i+6, 1, 0 );
      __builtin_prefetch( out +i+6, 1, 0 );

      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
@@ -906,17 +900,17 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
      inout += BLOCK_LEN_M256I;
   }

   _mm256_store_si256( state, state0 );
   _mm256_store_si256( &state[1], state1 );
   _mm256_store_si256( &state[2], state2 );
   _mm256_store_si256( &state[3], state3 );
   _mm256_store_si256( casto_m256i( State, 0 ), state0 );
   _mm256_store_si256( casto_m256i( State, 1 ), state1 );
   _mm256_store_si256( casto_m256i( State, 2 ), state2 );
   _mm256_store_si256( casto_m256i( State, 3 ), state3 );

#elif defined __AVX__

   __m128i* state = (__m128i*)State;
   __m128i* in = (__m128i*)rowIn;
   __m128i* inout = (__m128i*)rowInOut;
   __m128i* out = (__m128i*)rowOut;
   __m128i *state = (__m128i*)State;
   const __m128i *in = (const __m128i*)rowIn;
   __m128i *inout = (__m128i*)rowInOut;
   __m128i *out = (__m128i*)rowOut;

   for ( i = 0; i < 6; i += 3)
   {
@@ -929,9 +923,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
   }

   // for the last round in this function that isn't optimized for AVX
   uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t* ptrWordIn = rowIn;       //In Lyra2: pointer to prev
   uint64_t* ptrWordOut = rowOut;     //In Lyra2: pointer to row
   uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
   uint64_t *ptrWordOut = rowOut;     //In Lyra2: pointer to row

   for ( i = 0; i < nCols; i++)
   {
@@ -997,9 +991,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,

#else

   uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t* ptrWordIn = rowIn;       //In Lyra2: pointer to prev
   uint64_t* ptrWordOut = rowOut;     //In Lyra2: pointer to row
   uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
   uint64_t *ptrWordOut = rowOut;     //In Lyra2: pointer to row

   for ( i = 0; i < nCols; i++)
   {
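The scalar #else branches show the duplex-row shape most plainly: each column
absorbs the wordwise sum of the prev row and the in/out row, runs one reduced
round, and squeezes into rowOut. A sketch of one column step (the round
function name is an assumption, standing in for whatever single-round
primitive the build uses):

for ( int w = 0; w < BLOCK_LEN_INT64; w++ )
   state[w] ^= ptrWordIn[w] + ptrWordInOut[w];   // absorb M[prev] [+] M[row*]
reduced_round( state );                          // one round, not twelve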
@@ -159,23 +159,26 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

//---- Housekeeping
void initState(uint64_t state[/*16*/]);
void initState( uint64_t state[/*16*/] );

//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );

//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
void absorbBlock( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );

//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
                        uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
                            uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );

//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);
//void printArray(unsigned char *array, unsigned int size, char *name);

////////////////////////////////////////////////////////////////////////////////////////////////

@@ -85,12 +85,12 @@ typedef unsigned int uint;
   U32TO8_BE((p) + 4, (uint32_t)((v) ));

typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16)));

/* SHA-256 */

static const uint32_t sha256_constants[64] = {
static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
@@ -123,10 +123,10 @@ static const uint32_t sha256_constants[64] = {

typedef struct sha256_hash_state_t {
   uint32_t H[8];
   uint32_t H[8] __attribute__ ((aligned (16)));
   uint64_t T;
   uint32_t leftover;
   uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
   uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16)));
} sha256_hash_state;

@@ -242,7 +242,7 @@ typedef struct sha256_hmac_state_t {
} sha256_hmac_state;

static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) {
   uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
   uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0};
   size_t i;

   neoscrypt_hash_init_sha256(&st->inner);
@@ -570,17 +570,17 @@ typedef struct blake2s_param_t {

/* State block of 180 bytes */
typedef struct blake2s_state_t {
   uint h[8];
   uint h[8] __attribute__ ((aligned (16)));
   uint t[2];
   uint f[2];
   uchar buf[2 * BLAKE2S_BLOCK_SIZE];
   uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16)));
   uint buflen;
} blake2s_state;

static void blake2s_compress(blake2s_state *S, const void *buf) {
   uint i;
   uint m[16];
   uint v[16];
   uint m[16] __attribute__ ((aligned (16)));
   uint v[16] __attribute__ ((aligned (16)));

   neoscrypt_copy(m, buf, 64);
   neoscrypt_copy(v, S, 32);
@@ -1082,6 +1082,7 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )

bool register_neoscrypt_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT;
   gate->scanhash = (void*)&scanhash_neoscrypt;
   gate->hash = (void*)&neoscrypt;
   gate->get_max64 = (void*)&get_neoscrypt_max64;
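The added aligned(16) attributes in the neoscrypt hunks are what make it safe
to hit these buffers with aligned SSE loads and stores; a 16-byte
_mm_load_si128 on a default-aligned array can fault. Illustration (assumes
<emmintrin.h>):

uint32_t v[16] __attribute__ ((aligned (16)));
__m128i x = _mm_load_si128( (const __m128i*)v );   // legal: v is 16-aligned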
@@ -21,6 +21,7 @@ void nist5hash( void *state, const void *input );

int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );
void init_nist5_ctx();
#endif

#endif

@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
   blake512_4way_close( &ctx.blake, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
   jh512_4way_close( &ctx.jh, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   blake512_4way_init( &ctx.blake );
   blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
   skein512_4way_close( &ctx.skein, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   keccak512_4way_init( &ctx.keccak );
   keccak512_4way( &ctx.keccak, vhash, 64 );

@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
   bmw512_4way_close( &ctx.bmw, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
   jh512_4way_close( &ctx.jh, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   blake512_4way_init( &ctx.blake );
   blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
   skein512_4way_close( &ctx.skein, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                 mm256_zero );
                                 m256_zero );

   keccak512_4way_init( &ctx.keccak );
   keccak512_4way( &ctx.keccak, vhash, 64 );
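The vh_mask idiom in anime/quark deserves a note: those chains branch per lane
on bit 3 of the first hash word, and the 4-way code replaces the branch with a
per-lane select. _mm256_cmpeq_epi64 against zero yields an all-ones 64-bit mask
exactly in the lanes where the bit is clear, and that mask later blends the two
candidate results. A sketch (vh0 and the two candidate vectors are assumed
names; bit3_mask is presumably set1_epi64x(8)):

__m256i bit3 = _mm256_set1_epi64x( 8 );
__m256i mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh0, bit3 ),
                                   _mm256_setzero_si256() );
// picks if_bit3_clear in lanes where mask is all-ones, else if_bit3_set
__m256i pick = _mm256_blendv_epi8( if_bit3_set, if_bit3_clear, mask );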
130
algo/qubit/deep-2way.c
Normal file
@@ -0,0 +1,130 @@
#include "deep-gate.h"

#if defined(DEEP_2WAY)

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"

typedef struct
{
   luffa_2way_context luffa;
   cubehashParam cube;
   sph_shavite512_context shavite;
   hashState_echo echo;
} deep_2way_ctx_holder;

deep_2way_ctx_holder deep_2way_ctx;

void init_deep_2way_ctx()
{
   luffa_2way_init( &deep_2way_ctx.luffa, 512 );
   cubehashInit( &deep_2way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &deep_2way_ctx.shavite );
   init_echo( &deep_2way_ctx.echo, 512 );
};

void deep_2way_hash( void *output, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*2] __attribute__ ((aligned (64)));
   deep_2way_ctx_holder ctx;

   memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
   luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
   luffa_2way_close( &ctx.luffa, vhash );
   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );

   cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                         (const byte*) hash0, 64 );
   memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
           sizeof(sph_shavite512_context) );
   sph_shavite512( &ctx.shavite, hash1, 64 );
   sph_shavite512_close( &ctx.shavite, hash1 );

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );

   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
}

int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
   uint32_t *noncep1 = vdata + 32+7;
   const uint32_t Htarg = ptarget[7];
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // big endian encode 0..18 uint32_t, 64 bits at a time
   swab32_array( endiandata, pdata, 20 );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );

   luffa_2way_init( &deep_2way_ctx.luffa, 512 );
   luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         found[0] = found[1] = false;
         be32enc( noncep0, n );
         be32enc( noncep1, n+1 );
         deep_2way_hash( hash, vdata );
         pdata[19] = n;

         if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
         {
            found[0] = true;
            num_found++;
            nonces[0] = n;
            work_set_target_ratio( work, hash );
         }
         if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
         {
            found[1] = true;
            num_found++;
            nonces[1] = n+1;
            work_set_target_ratio( work, hash+8 );
         }
         n += 2;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
                && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
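scanhash_deep_2way's htmax/masks tables implement a cheap pre-filter: it picks
the coarsest mask whose htmax still covers Htarg, so most hashes are rejected
with one AND on hash[7] and only survivors pay for the full 256-bit fulltest().
The shape of the test, in isolation:

uint32_t mask = 0xFFFFF000;               // valid whenever Htarg <= 0xFFF
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
   /* candidate passed both the cheap filter and the full target test */
}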
17
algo/qubit/deep-gate.c
Normal file
@@ -0,0 +1,17 @@
#include "deep-gate.h"

bool register_deep_algo( algo_gate_t* gate )
{
#if defined (DEEP_2WAY)
  init_deep_2way_ctx();
  gate->scanhash = (void*)&scanhash_deep_2way;
  gate->hash     = (void*)&deep_2way_hash;
#else
  init_deep_ctx();
  gate->scanhash = (void*)&scanhash_deep;
  gate->hash     = (void*)&deep_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  return true;
};
32
algo/qubit/deep-gate.h
Normal file
@@ -0,0 +1,32 @@
#ifndef DEEP_GATE_H__
#define DEEP_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__) && defined(__AES__)
  #define DEEP_2WAY
#endif

bool register_deep_algo( algo_gate_t* gate );

#if defined(DEEP_2WAY)

void deep_2way_hash( void *state, const void *input );

int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );

void init_deep_2way_ctx();

#endif

void deep_hash( void *state, const void *input );

int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done );

void init_deep_ctx();

#endif
@@ -1,9 +1,9 @@
#include "algo-gate-api.h"
#include "deep-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifndef NO_AES_NI
  #include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
   return 0;
}

bool register_deep_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
   init_deep_ctx();
   gate->scanhash = (void*)&scanhash_deep;
   gate->hash = (void*)&deep_hash;
   return true;
};
137
algo/qubit/qubit-2way.c
Normal file
@@ -0,0 +1,137 @@
#include "qubit-gate.h"

#if defined(QUBIT_2WAY)

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"

typedef struct
{
   luffa_2way_context luffa;
   cubehashParam cube;
   sph_shavite512_context shavite;
   simd_2way_context simd;
   hashState_echo echo;
} qubit_2way_ctx_holder;

qubit_2way_ctx_holder qubit_2way_ctx;

void init_qubit_2way_ctx()
{
   cubehashInit( &qubit_2way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &qubit_2way_ctx.shavite );
   simd_2way_init( &qubit_2way_ctx.simd, 512 );
   init_echo( &qubit_2way_ctx.echo, 512 );
};

void qubit_2way_hash( void *output, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*2] __attribute__ ((aligned (64)));
   qubit_2way_ctx_holder ctx;

   memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
   luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
   luffa_2way_close( &ctx.luffa, vhash );
   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );

   cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                         (const byte*) hash0, 64 );
   memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
           sizeof(sph_shavite512_context) );
   sph_shavite512( &ctx.shavite, hash1, 64 );
   sph_shavite512_close( &ctx.shavite, hash1 );

   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );

   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
}

int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
   uint32_t *noncep1 = vdata + 32+7;
   const uint32_t Htarg = ptarget[7];
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // big endian encode 0..18 uint32_t, 64 bits at a time
   swab32_array( endiandata, pdata, 20 );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );

   luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
   luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         found[0] = found[1] = false;
         be32enc( noncep0, n );
         be32enc( noncep1, n+1 );
         qubit_2way_hash( hash, vdata );
         pdata[19] = n;

         if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
         {
            found[0] = true;
            num_found++;
            nonces[0] = n;
            work_set_target_ratio( work, hash );
         }
         if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
         {
            found[1] = true;
            num_found++;
            nonces[1] = n+1;
            work_set_target_ratio( work, hash+8 );
         }
         n += 2;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
                && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
|
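The 2-way flow above alternates between vectorized stages (Luffa, SIMD) and stages without a 2-way variant (CubeHash, SHAVITE, ECHO), interleaving two 512-bit lanes into 256-bit registers before each vectorized call and splitting them back out afterwards. Below is a plain-C model of what mm256_interleave_2x128 appears to do, inferred from its use in this file (it is consistent with the nonce offsets vdata + 32+3 and 32+7 above); the real routine lives in avxdefs.h and works on AVX registers, so this sketch is illustrative only.

#include <stdint.h>
#include <string.h>

// Pack alternating 128-bit blocks of two inputs:
// dst = s0 block0, s1 block0, s0 block1, s1 block1, ...
static void interleave_2x128_ref( uint64_t *dst, const uint64_t *s0,
                                  const uint64_t *s1, int bit_len )
{
    // one 128-bit block is two uint64_t words
    for ( int b = 0; b < bit_len / 128; b++ )
    {
        memcpy( dst + 4*b,     s0 + 2*b, 16 );   // lane 0, block b
        memcpy( dst + 4*b + 2, s1 + 2*b, 16 );   // lane 1, block b
    }
}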
17 algo/qubit/qubit-gate.c Normal file
@@ -0,0 +1,17 @@
#include "qubit-gate.h"

bool register_qubit_algo( algo_gate_t* gate )
{
#if defined (QUBIT_2WAY)
    init_qubit_2way_ctx();
    gate->scanhash = (void*)&scanhash_qubit_2way;
    gate->hash     = (void*)&qubit_2way_hash;
#else
    init_qubit_ctx();
    gate->scanhash = (void*)&scanhash_qubit;
    gate->hash     = (void*)&qubit_hash;
#endif
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    return true;
};
32 algo/qubit/qubit-gate.h Normal file
@@ -0,0 +1,32 @@
#ifndef QUBIT_GATE_H__
#define QUBIT_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__) && defined(__AES__)
  #define QUBIT_2WAY
#endif

bool register_qubit_algo( algo_gate_t* gate );

#if defined(QUBIT_2WAY)

void qubit_2way_hash( void *state, const void *input );

int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );

void init_qubit_2way_ctx();

#endif

void qubit_hash( void *state, const void *input );

int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

void init_qubit_ctx();

#endif
@@ -1,11 +1,11 @@
-#include "algo-gate-api.h"
+#include "qubit-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
   update_luffa( &qubit_luffa_mid, input, 64 );
}

-void qubithash(void *output, const void *input)
+void qubit_hash(void *output, const void *input)
{
   unsigned char hash[128] __attribute((aligned(64)));
   #define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
   {
      pdata[19] = ++n;
      be32enc(&endiandata[19], n);
-     qubithash(hash64, endiandata);
+     qubit_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
      if (!(hash64[7] & mask))
      {
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
   return 0;
}
-
-bool register_qubit_algo( algo_gate_t* gate )
-{
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-    init_qubit_ctx();
-    gate->scanhash = (void*)&scanhash_qubit;
-    gate->hash     = (void*)&qubithash;
-    return true;
-};
138 algo/ripemd/lbry-4way.c Normal file
@@ -0,0 +1,138 @@
#include "lbry-gate.h"

#if defined(LBRY_4WAY)

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha2-hash-4way.h"
#include "ripemd-hash-4way.h"

static __thread sha256_4way_context sha256_mid;

void lbry_4way_hash( void* output, const void* input )
{
   sha256_4way_context    ctx_sha256 __attribute__ ((aligned (64)));
   sha512_4way_context    ctx_sha512;
   ripemd160_4way_context ctx_ripemd;
   uint32_t _ALIGN(64) vhashA[16<<2];
   uint32_t _ALIGN(64) vhashB[16<<2];
   uint32_t _ALIGN(64) vhashC[16<<2];

   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
   sha256_4way( &ctx_sha256, input + (64<<2), 48 );
   sha256_4way_close( &ctx_sha256, vhashA );

   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );

   // sha512 64 bit data, 64 byte output
   mm256_reinterleave_4x64( vhashB, vhashA, 256 );
   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashB, 32 );
   sha512_4way_close( &ctx_sha512, vhashB );
   mm256_reinterleave_4x32( vhashA, vhashB, 512 );

   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA, 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashB );

   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashC );

   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashB, 20 );
   sha256_4way( &ctx_sha256, vhashC, 20 );
   sha256_4way_close( &ctx_sha256, vhashA );

   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );

   mm_deinterleave_4x32( output, output+32, output+64, output+96,
                         vhashA, 256 );
}

int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 108;   // 27*4
   uint32_t *noncep1 = vdata + 109;
   uint32_t *noncep2 = vdata + 110;
   uint32_t *noncep3 = vdata + 111;

   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // we need bigendian data...
   swab32_array( edata, pdata, 32 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
   sha256_4way_init( &sha256_mid );
   sha256_4way( &sha256_mid, vdata, 64 );

   // 6 elements in htmax/masks; sizeof(masks) would overrun the arrays
   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         found[0] = found[1] = found[2] = found[3] = false;
         be32enc( noncep0, n );
         be32enc( noncep1, n+1 );
         be32enc( noncep2, n+2 );
         be32enc( noncep3, n+3 );
         lbry_4way_hash( hash, vdata );

         if ( !( hash[7] & mask ) && fulltest( hash, ptarget ) )
         {
            found[0] = true;
            num_found++;
            nonces[0] = pdata[27] = n;
            work_set_target_ratio( work, hash );
         }
         if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget ) )
         {
            found[1] = true;
            num_found++;
            nonces[1] = n+1;
            work_set_target_ratio( work, hash+8 );
         }
         if ( !( (hash+16)[7] & mask ) && fulltest( hash+16, ptarget ) )
         {
            found[2] = true;
            num_found++;
            nonces[2] = n+2;
            work_set_target_ratio( work, hash+16 );
         }
         if ( !( (hash+24)[7] & mask ) && fulltest( hash+24, ptarget ) )
         {
            found[3] = true;
            num_found++;
            nonces[3] = n+3;
            work_set_target_ratio( work, hash+24 );
         }
         n += 4;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
                && !work_restart[thr_id].restart );
      break;
   }

   *hashes_done = n - first_nonce;
   return num_found;
}

#endif
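For reference, the chain that lbry_4way_hash runs four-wide is, per lane: double SHA-256 over the 112-byte header, SHA-512 of the 32-byte result, RIPEMD-160 of each 256-bit half of that, then double SHA-256 over the two concatenated 20-byte digests. A scalar restatement using OpenSSL one-shot helpers as stand-ins for the vectorized contexts; this sketch is for illustration, the function name is hypothetical, and it omits the midstate and interleaving machinery above.

#include <openssl/sha.h>
#include <openssl/ripemd.h>

static void lbry_hash_ref( unsigned char out[32],
                           const unsigned char header[112] )
{
   unsigned char h32[SHA256_DIGEST_LENGTH];
   unsigned char h64[SHA512_DIGEST_LENGTH];
   unsigned char r[2 * RIPEMD160_DIGEST_LENGTH];

   SHA256( header, 112, h32 );
   SHA256( h32, 32, h32 );              // double sha256 of the header
   SHA512( h32, 32, h64 );
   RIPEMD160( h64,      32, r );        // first 256-bit half
   RIPEMD160( h64 + 32, 32, r + 20 );   // second 256-bit half
   SHA256( r, 40, out );
   SHA256( out, 32, out );              // final double sha256
}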
94 algo/ripemd/lbry-gate.c Normal file
@@ -0,0 +1,94 @@
#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

double lbry_calc_network_diff( struct work *work )
{
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...

   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
   uint32_t bits = (nbits & 0xffffff);
   int16_t shift = (swab32(nbits) & 0xff);   // 0x1c = 28
   double d = (double)0x0000ffff / (double)bits;

   for (int m=shift; m < 29; m++) d *= 256.0;
   for (int m=29; m < shift; m++) d /= 256.0;
   if (opt_debug_diff)
      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);

   return d;
}

// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
                                    struct stratum_ctx *sctx )
{
   unsigned char *xnonce2str;
   uint32_t ntime, nonce;
   char ntimestr[9], noncestr[9];

   le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
   le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len );
   snprintf( req, JSON_BUF_LEN,
        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
        rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free( xnonce2str );
}

void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
   unsigned char merkle_root[64] = { 0 };
   size_t t;
   int i;

   algo_gate.gen_merkle_root( merkle_root, sctx );
   // Increment extranonce2
   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
   // Assemble block header
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = le32dec( sctx->job.version );
   for ( i = 0; i < 8; i++ )
      g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
   for ( i = 0; i < 8; i++ )
      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
   for ( int i = 0; i < 8; i++ )
      g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
   g_work->data[ LBRY_NTIME_INDEX ] = le32dec( sctx->job.ntime );
   g_work->data[ LBRY_NBITS_INDEX ] = le32dec( sctx->job.nbits );
   g_work->data[28] = 0x80000000;
}

void lbry_set_target( struct work* work, double job_diff )
{
   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

int64_t lbry_get_max64() { return 0x1ffffLL; }

bool register_lbry_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
#if defined (LBRY_4WAY)
   gate->scanhash = (void*)&scanhash_lbry_4way;
   gate->hash     = (void*)&lbry_4way_hash;
#else
   gate->scanhash = (void*)&scanhash_lbry;
   gate->hash     = (void*)&lbry_hash;
#endif
   gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
   gate->get_max64             = (void*)&lbry_get_max64;
   gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
   gate->build_extraheader     = (void*)&lbry_build_extraheader;
   gate->set_target            = (void*)&lbry_set_target;
   gate->ntime_index           = LBRY_NTIME_INDEX;
   gate->nbits_index           = LBRY_NBITS_INDEX;
   gate->nonce_index           = LBRY_NONCE_INDEX;
   gate->work_data_size        = LBRY_WORK_DATA_SIZE;
   return true;
}
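Working through the compact-target decode above with the sample in its own comment: nbits 0x1c05ea29 gives shift 0x1c (28) and mantissa 0x05ea29, so d = 0xffff / 0x05ea29 ≈ 0.169, scaled up by 256 once (for the gap between shift 28 and 29) ≈ 43.28. A standalone check of that arithmetic follows; it mirrors lbry_calc_network_diff without the struct work and endian plumbing.

#include <stdio.h>
#include <stdint.h>

int main( void )
{
   uint32_t nbits = 0x1c05ea29;        // sample from the comment above
   uint32_t bits  = nbits & 0xffffff;  // mantissa 0x05ea29
   int shift      = nbits >> 24;       // exponent 0x1c = 28
   double d = (double)0x0000ffff / (double)bits;

   for ( int m = shift; m < 29; m++ ) d *= 256.0;
   for ( int m = 29; m < shift; m++ ) d /= 256.0;
   printf( "diff %.3f\n", d );         // prints: diff 43.281
   return 0;
}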
30 algo/ripemd/lbry-gate.h Normal file
@@ -0,0 +1,30 @@
#ifndef LBRY_GATE_H__
#define LBRY_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__)
  #define LBRY_4WAY
#endif

#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
#define LBRY_NONCE_INDEX 27
#define LBRY_WORK_DATA_SIZE 192
#define LBRY_WORK_CMP_SIZE 76  // same as default

bool register_lbry_algo( algo_gate_t* gate );

#if defined(LBRY_4WAY)

void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );
#endif

void lbry_hash( void *state, const void *input );
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done );
#endif
@@ -1,19 +1,12 @@
-#include "algo-gate-api.h"
+#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
-#include "ripemd/sph_ripemd.h"
-#include "sha/sph_sha2.h"
+#include "sph_ripemd.h"
+#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>

-#define LBRY_NTIME_INDEX 25
-#define LBRY_NBITS_INDEX 26
-#define LBRY_NONCE_INDEX 27
-#define LBRY_WORK_DATA_SIZE 192
-#define LBRY_WORK_CMP_SIZE 76 // same as default
-
void lbry_hash(void* output, const void* input)
{
#ifndef USE_SPH_SHA
@@ -151,88 +144,3 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
   pdata[27] = n;
   return 0;
}
-
-double lbry_calc_network_diff( struct work *work )
-{
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-
-   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
-   uint32_t bits = (nbits & 0xffffff);
-   int16_t shift = (swab32(nbits) & 0xff);   // 0x1c = 28
-   double d = (double)0x0000ffff / (double)bits;
-
-   for (int m=shift; m < 29; m++) d *= 256.0;
-   for (int m=29; m < shift; m++) d /= 256.0;
-   if (opt_debug_diff)
-      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-
-   return d;
-}
-
-// std_le should work but it doesn't
-void lbry_le_build_stratum_request( char *req, struct work *work,
-                                    struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
-   le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-        rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free( xnonce2str );
-}
-
-void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   unsigned char merkle_root[64] = { 0 };
-   size_t t;
-   int i;
-
-   algo_gate.gen_merkle_root( merkle_root, sctx );
-   // Increment extranonce2
-   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-   // Assemble block header
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-   for ( int i = 0; i < 8; i++ )
-      g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
-   g_work->data[ LBRY_NTIME_INDEX ] = le32dec( sctx->job.ntime );
-   g_work->data[ LBRY_NBITS_INDEX ] = le32dec( sctx->job.nbits );
-   g_work->data[28] = 0x80000000;
-}
-
-void lbry_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-int64_t lbry_get_max64() { return 0x1ffffLL; }
-
-bool register_lbry_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
-   gate->scanhash = (void*)&scanhash_lbry;
-   gate->hash     = (void*)&lbry_hash;
-   gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
-   gate->get_max64             = (void*)&lbry_get_max64;
-   gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
-   gate->build_extraheader     = (void*)&lbry_build_extraheader;
-   gate->set_target            = (void*)&lbry_set_target;
-   gate->ntime_index           = LBRY_NTIME_INDEX;
-   gate->nbits_index           = LBRY_NBITS_INDEX;
-   gate->nonce_index           = LBRY_NONCE_INDEX;
-   gate->work_data_size        = LBRY_WORK_DATA_SIZE;
-   return true;
-}
323 algo/ripemd/ripemd-hash-4way.c Normal file
@@ -0,0 +1,323 @@
#include "ripemd-hash-4way.h"

#if defined(__AVX__)

#include <stddef.h>
#include <string.h>

/*
 * Round functions for RIPEMD-128 and RIPEMD-160.
 */
#define F1(x, y, z) \
   _mm_xor_si128( _mm_xor_si128( x, y ), z )

#define F2(x, y, z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )

#define F3(x, y, z) \
   _mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )

#define F4(x, y, z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )

#define F5(x, y, z) \
   _mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )

static const uint32_t IV[5] =
        { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };

/*
 * Round constants for RIPEMD-160.
 */
#define K11 0x00000000
#define K12 0x5A827999
#define K13 0x6ED9EBA1
#define K14 0x8F1BBCDC
#define K15 0xA953FD4E

#define K21 0x50A28BE6
#define K22 0x5C4DD124
#define K23 0x6D703EF3
#define K24 0x7A6D76E9
#define K25 0x00000000

#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
   a = _mm_add_epi32( mm_rotl_32( _mm_add_epi32( _mm_add_epi32( \
                 _mm_add_epi32( a, f( b, c, d ) ), r ), \
                 _mm_set1_epi32( k ) ), s ), e ); \
   c = mm_rotl_32( c, 10 );\
} while (0)

#define ROUND1(a, b, c, d, e, f, s, r, k) \
   RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)

#define ROUND2(a, b, c, d, e, f, s, r, k) \
   RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)

static void ripemd160_4way_round( ripemd160_4way_context *sc )
{
   const __m128i *in = (__m128i*)sc->buf;
   __m128i *h = (__m128i*)sc->val;
   register __m128i A1, B1, C1, D1, E1;
   register __m128i A2, B2, C2, D2, E2;
   __m128i tmp;

   A1 = A2 = h[0];
   B1 = B2 = h[1];
   C1 = C2 = h[2];
   D1 = D2 = h[3];
   E1 = E2 = h[4];

   ROUND1( A, B, C, D, E, F1, 11, in[ 0], 1 );
   ROUND1( E, A, B, C, D, F1, 14, in[ 1], 1 );
   ROUND1( D, E, A, B, C, F1, 15, in[ 2], 1 );
   ROUND1( C, D, E, A, B, F1, 12, in[ 3], 1 );
   ROUND1( B, C, D, E, A, F1, 5, in[ 4], 1 );
   ROUND1( A, B, C, D, E, F1, 8, in[ 5], 1 );
   ROUND1( E, A, B, C, D, F1, 7, in[ 6], 1 );
   ROUND1( D, E, A, B, C, F1, 9, in[ 7], 1 );
   ROUND1( C, D, E, A, B, F1, 11, in[ 8], 1 );
   ROUND1( B, C, D, E, A, F1, 13, in[ 9], 1 );
   ROUND1( A, B, C, D, E, F1, 14, in[10], 1 );
   ROUND1( E, A, B, C, D, F1, 15, in[11], 1 );
   ROUND1( D, E, A, B, C, F1, 6, in[12], 1 );
   ROUND1( C, D, E, A, B, F1, 7, in[13], 1 );
   ROUND1( B, C, D, E, A, F1, 9, in[14], 1 );
   ROUND1( A, B, C, D, E, F1, 8, in[15], 1 );

   ROUND1( E, A, B, C, D, F2, 7, in[ 7], 2 );
   ROUND1( D, E, A, B, C, F2, 6, in[ 4], 2 );
   ROUND1( C, D, E, A, B, F2, 8, in[13], 2 );
   ROUND1( B, C, D, E, A, F2, 13, in[ 1], 2 );
   ROUND1( A, B, C, D, E, F2, 11, in[10], 2 );
   ROUND1( E, A, B, C, D, F2, 9, in[ 6], 2 );
   ROUND1( D, E, A, B, C, F2, 7, in[15], 2 );
   ROUND1( C, D, E, A, B, F2, 15, in[ 3], 2 );
   ROUND1( B, C, D, E, A, F2, 7, in[12], 2 );
   ROUND1( A, B, C, D, E, F2, 12, in[ 0], 2 );
   ROUND1( E, A, B, C, D, F2, 15, in[ 9], 2 );
   ROUND1( D, E, A, B, C, F2, 9, in[ 5], 2 );
   ROUND1( C, D, E, A, B, F2, 11, in[ 2], 2 );
   ROUND1( B, C, D, E, A, F2, 7, in[14], 2 );
   ROUND1( A, B, C, D, E, F2, 13, in[11], 2 );
   ROUND1( E, A, B, C, D, F2, 12, in[ 8], 2 );

   ROUND1( D, E, A, B, C, F3, 11, in[ 3], 3 );
   ROUND1( C, D, E, A, B, F3, 13, in[10], 3 );
   ROUND1( B, C, D, E, A, F3, 6, in[14], 3 );
   ROUND1( A, B, C, D, E, F3, 7, in[ 4], 3 );
   ROUND1( E, A, B, C, D, F3, 14, in[ 9], 3 );
   ROUND1( D, E, A, B, C, F3, 9, in[15], 3 );
   ROUND1( C, D, E, A, B, F3, 13, in[ 8], 3 );
   ROUND1( B, C, D, E, A, F3, 15, in[ 1], 3 );
   ROUND1( A, B, C, D, E, F3, 14, in[ 2], 3 );
   ROUND1( E, A, B, C, D, F3, 8, in[ 7], 3 );
   ROUND1( D, E, A, B, C, F3, 13, in[ 0], 3 );
   ROUND1( C, D, E, A, B, F3, 6, in[ 6], 3 );
   ROUND1( B, C, D, E, A, F3, 5, in[13], 3 );
   ROUND1( A, B, C, D, E, F3, 12, in[11], 3 );
   ROUND1( E, A, B, C, D, F3, 7, in[ 5], 3 );
   ROUND1( D, E, A, B, C, F3, 5, in[12], 3 );

   ROUND1( C, D, E, A, B, F4, 11, in[ 1], 4 );
   ROUND1( B, C, D, E, A, F4, 12, in[ 9], 4 );
   ROUND1( A, B, C, D, E, F4, 14, in[11], 4 );
   ROUND1( E, A, B, C, D, F4, 15, in[10], 4 );
   ROUND1( D, E, A, B, C, F4, 14, in[ 0], 4 );
   ROUND1( C, D, E, A, B, F4, 15, in[ 8], 4 );
   ROUND1( B, C, D, E, A, F4, 9, in[12], 4 );
   ROUND1( A, B, C, D, E, F4, 8, in[ 4], 4 );
   ROUND1( E, A, B, C, D, F4, 9, in[13], 4 );
   ROUND1( D, E, A, B, C, F4, 14, in[ 3], 4 );
   ROUND1( C, D, E, A, B, F4, 5, in[ 7], 4 );
   ROUND1( B, C, D, E, A, F4, 6, in[15], 4 );
   ROUND1( A, B, C, D, E, F4, 8, in[14], 4 );
   ROUND1( E, A, B, C, D, F4, 6, in[ 5], 4 );
   ROUND1( D, E, A, B, C, F4, 5, in[ 6], 4 );
   ROUND1( C, D, E, A, B, F4, 12, in[ 2], 4 );

   ROUND1( B, C, D, E, A, F5, 9, in[ 4], 5 );
   ROUND1( A, B, C, D, E, F5, 15, in[ 0], 5 );
   ROUND1( E, A, B, C, D, F5, 5, in[ 5], 5 );
   ROUND1( D, E, A, B, C, F5, 11, in[ 9], 5 );
   ROUND1( C, D, E, A, B, F5, 6, in[ 7], 5 );
   ROUND1( B, C, D, E, A, F5, 8, in[12], 5 );
   ROUND1( A, B, C, D, E, F5, 13, in[ 2], 5 );
   ROUND1( E, A, B, C, D, F5, 12, in[10], 5 );
   ROUND1( D, E, A, B, C, F5, 5, in[14], 5 );
   ROUND1( C, D, E, A, B, F5, 12, in[ 1], 5 );
   ROUND1( B, C, D, E, A, F5, 13, in[ 3], 5 );
   ROUND1( A, B, C, D, E, F5, 14, in[ 8], 5 );
   ROUND1( E, A, B, C, D, F5, 11, in[11], 5 );
   ROUND1( D, E, A, B, C, F5, 8, in[ 6], 5 );
   ROUND1( C, D, E, A, B, F5, 5, in[15], 5 );
   ROUND1( B, C, D, E, A, F5, 6, in[13], 5 );

   ROUND2( A, B, C, D, E, F5, 8, in[ 5], 1 );
   ROUND2( E, A, B, C, D, F5, 9, in[14], 1 );
   ROUND2( D, E, A, B, C, F5, 9, in[ 7], 1 );
   ROUND2( C, D, E, A, B, F5, 11, in[ 0], 1 );
   ROUND2( B, C, D, E, A, F5, 13, in[ 9], 1 );
   ROUND2( A, B, C, D, E, F5, 15, in[ 2], 1 );
   ROUND2( E, A, B, C, D, F5, 15, in[11], 1 );
   ROUND2( D, E, A, B, C, F5, 5, in[ 4], 1 );
   ROUND2( C, D, E, A, B, F5, 7, in[13], 1 );
   ROUND2( B, C, D, E, A, F5, 7, in[ 6], 1 );
   ROUND2( A, B, C, D, E, F5, 8, in[15], 1 );
   ROUND2( E, A, B, C, D, F5, 11, in[ 8], 1 );
   ROUND2( D, E, A, B, C, F5, 14, in[ 1], 1 );
   ROUND2( C, D, E, A, B, F5, 14, in[10], 1 );
   ROUND2( B, C, D, E, A, F5, 12, in[ 3], 1 );
   ROUND2( A, B, C, D, E, F5, 6, in[12], 1 );

   ROUND2( E, A, B, C, D, F4, 9, in[ 6], 2 );
   ROUND2( D, E, A, B, C, F4, 13, in[11], 2 );
   ROUND2( C, D, E, A, B, F4, 15, in[ 3], 2 );
   ROUND2( B, C, D, E, A, F4, 7, in[ 7], 2 );
   ROUND2( A, B, C, D, E, F4, 12, in[ 0], 2 );
   ROUND2( E, A, B, C, D, F4, 8, in[13], 2 );
   ROUND2( D, E, A, B, C, F4, 9, in[ 5], 2 );
   ROUND2( C, D, E, A, B, F4, 11, in[10], 2 );
   ROUND2( B, C, D, E, A, F4, 7, in[14], 2 );
   ROUND2( A, B, C, D, E, F4, 7, in[15], 2 );
   ROUND2( E, A, B, C, D, F4, 12, in[ 8], 2 );
   ROUND2( D, E, A, B, C, F4, 7, in[12], 2 );
   ROUND2( C, D, E, A, B, F4, 6, in[ 4], 2 );
   ROUND2( B, C, D, E, A, F4, 15, in[ 9], 2 );
   ROUND2( A, B, C, D, E, F4, 13, in[ 1], 2 );
   ROUND2( E, A, B, C, D, F4, 11, in[ 2], 2 );

   ROUND2( D, E, A, B, C, F3, 9, in[15], 3 );
   ROUND2( C, D, E, A, B, F3, 7, in[ 5], 3 );
   ROUND2( B, C, D, E, A, F3, 15, in[ 1], 3 );
   ROUND2( A, B, C, D, E, F3, 11, in[ 3], 3 );
   ROUND2( E, A, B, C, D, F3, 8, in[ 7], 3 );
   ROUND2( D, E, A, B, C, F3, 6, in[14], 3 );
   ROUND2( C, D, E, A, B, F3, 6, in[ 6], 3 );
   ROUND2( B, C, D, E, A, F3, 14, in[ 9], 3 );
   ROUND2( A, B, C, D, E, F3, 12, in[11], 3 );
   ROUND2( E, A, B, C, D, F3, 13, in[ 8], 3 );
   ROUND2( D, E, A, B, C, F3, 5, in[12], 3 );
   ROUND2( C, D, E, A, B, F3, 14, in[ 2], 3 );
   ROUND2( B, C, D, E, A, F3, 13, in[10], 3 );
   ROUND2( A, B, C, D, E, F3, 13, in[ 0], 3 );
   ROUND2( E, A, B, C, D, F3, 7, in[ 4], 3 );
   ROUND2( D, E, A, B, C, F3, 5, in[13], 3 );

   ROUND2( C, D, E, A, B, F2, 15, in[ 8], 4 );
   ROUND2( B, C, D, E, A, F2, 5, in[ 6], 4 );
   ROUND2( A, B, C, D, E, F2, 8, in[ 4], 4 );
   ROUND2( E, A, B, C, D, F2, 11, in[ 1], 4 );
   ROUND2( D, E, A, B, C, F2, 14, in[ 3], 4 );
   ROUND2( C, D, E, A, B, F2, 14, in[11], 4 );
   ROUND2( B, C, D, E, A, F2, 6, in[15], 4 );
   ROUND2( A, B, C, D, E, F2, 14, in[ 0], 4 );
   ROUND2( E, A, B, C, D, F2, 6, in[ 5], 4 );
   ROUND2( D, E, A, B, C, F2, 9, in[12], 4 );
   ROUND2( C, D, E, A, B, F2, 12, in[ 2], 4 );
   ROUND2( B, C, D, E, A, F2, 9, in[13], 4 );
   ROUND2( A, B, C, D, E, F2, 12, in[ 9], 4 );
   ROUND2( E, A, B, C, D, F2, 5, in[ 7], 4 );
   ROUND2( D, E, A, B, C, F2, 15, in[10], 4 );
   ROUND2( C, D, E, A, B, F2, 8, in[14], 4 );

   ROUND2( B, C, D, E, A, F1, 8, in[12], 5 );
   ROUND2( A, B, C, D, E, F1, 5, in[15], 5 );
   ROUND2( E, A, B, C, D, F1, 12, in[10], 5 );
   ROUND2( D, E, A, B, C, F1, 9, in[ 4], 5 );
   ROUND2( C, D, E, A, B, F1, 12, in[ 1], 5 );
   ROUND2( B, C, D, E, A, F1, 5, in[ 5], 5 );
   ROUND2( A, B, C, D, E, F1, 14, in[ 8], 5 );
   ROUND2( E, A, B, C, D, F1, 6, in[ 7], 5 );
   ROUND2( D, E, A, B, C, F1, 8, in[ 6], 5 );
   ROUND2( C, D, E, A, B, F1, 13, in[ 2], 5 );
   ROUND2( B, C, D, E, A, F1, 6, in[13], 5 );
   ROUND2( A, B, C, D, E, F1, 5, in[14], 5 );
   ROUND2( E, A, B, C, D, F1, 15, in[ 0], 5 );
   ROUND2( D, E, A, B, C, F1, 13, in[ 3], 5 );
   ROUND2( C, D, E, A, B, F1, 11, in[ 9], 5 );
   ROUND2( B, C, D, E, A, F1, 11, in[11], 5 );

   tmp  = _mm_add_epi32( _mm_add_epi32( h[1], C1 ), D2 );
   h[1] = _mm_add_epi32( _mm_add_epi32( h[2], D1 ), E2 );
   h[2] = _mm_add_epi32( _mm_add_epi32( h[3], E1 ), A2 );
   h[3] = _mm_add_epi32( _mm_add_epi32( h[4], A1 ), B2 );
   h[4] = _mm_add_epi32( _mm_add_epi32( h[0], B1 ), C2 );
   h[0] = tmp;
}

void ripemd160_4way_init( ripemd160_4way_context *sc )
{
   sc->val[0] = _mm_set1_epi32( IV[0] );
   sc->val[1] = _mm_set1_epi32( IV[1] );
   sc->val[2] = _mm_set1_epi32( IV[2] );
   sc->val[3] = _mm_set1_epi32( IV[3] );
   sc->val[4] = _mm_set1_epi32( IV[4] );
   sc->count_high = sc->count_low = 0;
}

void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;
   size_t ptr;
   const int block_size = 64;

   ptr = (unsigned)sc->count_low & (block_size - 1U);
   while ( len > 0 )
   {
      size_t clen;
      uint32_t clow, clow2;

      clen = block_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
      vdata = vdata + (clen>>2);
      ptr += clen;
      len -= clen;
      if ( ptr == block_size )
      {
         ripemd160_4way_round( sc );
         ptr = 0;
      }
      clow = sc->count_low;
      clow2 = clow + clen;
      sc->count_low = clow2;
      if ( clow2 < clow )
         sc->count_high++;
   }
}

void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
{
   unsigned ptr, u;
   uint32_t low, high;
   const int block_size = 64;
   const int pad = block_size - 8;

   ptr = (unsigned)sc->count_low & ( block_size - 1U);
   sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
   ptr += 4;

   if ( ptr > pad )
   {
      memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
      ripemd160_4way_round( sc );
      memset_zero_128( sc->buf, pad>>2 );
   }
   else
      memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );

   low = sc->count_low;
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
   sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
   ripemd160_4way_round( sc );
   for (u = 0; u < 5; u ++)
      casti_m128i( dst, u ) = sc->val[u];
}

#endif
23 algo/ripemd/ripemd-hash-4way.h Normal file
@@ -0,0 +1,23 @@
#ifndef RIPEMD_HASH_4WAY_H__
#define RIPEMD_HASH_4WAY_H__

#include <stddef.h>
#include "algo/sha/sph_types.h"

#if defined(__AVX__)

#include "avxdefs.h"

typedef struct
{
   __m128i buf[64>>2];
   __m128i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;

void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );

#endif
#endif
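The header above follows the sph-style init/update/close API, except that data and digests are interleaved 4x32: four independent messages advance in lock-step, one per 32-bit SSE lane. A small usage sketch; the wrapper name is hypothetical, and the caller is responsible for interleaving inputs and deinterleaving digests (lbry-4way.c uses mm_interleave_4x32 / mm_deinterleave_4x32 from avxdefs.h for this).

#include <stdint.h>
#include "ripemd-hash-4way.h"

// Hash four equal-length messages at once. msgs_4x32 is the 4x32
// interleaved input; digests_4x32 receives five interleaved __m128i
// words (20 bytes per lane).
void ripemd160_4x( void *digests_4x32, const void *msgs_4x32,
                   size_t len_per_lane )
{
   ripemd160_4way_context ctx;
   ripemd160_4way_init( &ctx );
   ripemd160_4way( &ctx, msgs_4x32, len_per_lane );
   ripemd160_4way_close( &ctx, digests_4x32 );
}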
@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )

bool register_scrypt_algo( algo_gate_t* gate )
{
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
   gate->miner_thread_init = (void*)&scrypt_miner_thread_init;
   gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;
@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
   sc->buf[ SPH_MAXPAD>>3 ] =
-         mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+         mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
   memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
   sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-         mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+         mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
   sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-         mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+         mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
   sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-         mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+         mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
   sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-         mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+         mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
   for ( u = 0; u < rnum; u ++ )
   {
#if defined BE64
-      ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
+      ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
      ((__m256i*)dst)[u] = sc->val[u];
#endif
@@ -1,247 +0,0 @@
-/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
-/*
- * SHA-384 / SHA-512 implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#include <stddef.h>
-#include <string.h>
-
-#include "sph_sha2.h"
-
-#if SPH_64
-
-#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
-
-#define ROTR64    SPH_ROTR64
-
-#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
-#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
-#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
-#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
-
-static const sph_u64 K512[80] = {
-   SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
-   SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
-   SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
-   SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
-   SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
-   SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
-   SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
-   SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
-   SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
-   SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
-   SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
-   SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
-   SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
-   SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
-   SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
-   SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
-   SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
-   SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
-   SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
-   SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
-   SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
-   SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
-   SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
-   SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
-   SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
-   SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
-   SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
-   SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
-   SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
-   SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
-   SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
-   SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
-   SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
-   SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
-   SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
-   SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
-   SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
-   SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
-   SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
-   SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
-};
-
-static const sph_u64 H384[8] = {
-   SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
-   SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
-   SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
-   SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
-};
-
-static const sph_u64 H512[8] = {
-   SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-   SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-   SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-   SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
-/*
- * This macro defines the body for a SHA-384 / SHA-512 compression function
- * implementation. The "in" parameter should evaluate, when applied to a
- * numerical input parameter from 0 to 15, to an expression which yields
- * the corresponding input block. The "r" parameter should evaluate to
- * an array or pointer expression designating the array of 8 words which
- * contains the input and output of the compression function.
- *
- * SHA-512 is hard for the compiler. If the loop is completely unrolled,
- * then the code will be quite huge (possibly more than 100 kB), and the
- * performance will be degraded due to cache misses on the code. We
- * unroll only eight steps, which avoids all needless copies when
- * 64-bit registers are swapped.
- */
-
-#define SHA3_STEP(A, B, C, D, E, F, G, H, i)   do { \
-      sph_u64 T1, T2; \
-      T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
-      T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
-      D = SPH_T64(D + T1); \
-      H = SPH_T64(T1 + T2); \
-   } while (0)
-
-#define SHA3_ROUND_BODY(in, r)   do { \
-      int i; \
-      sph_u64 A, B, C, D, E, F, G, H; \
-      sph_u64 W[80]; \
- \
-      for (i = 0; i < 16; i ++) \
-         W[i] = in(i); \
-      for (i = 16; i < 80; i ++) \
-         W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
-            + SSG5_0(W[i - 15]) + W[i - 16]); \
-      A = (r)[0]; \
-      B = (r)[1]; \
-      C = (r)[2]; \
-      D = (r)[3]; \
-      E = (r)[4]; \
-      F = (r)[5]; \
-      G = (r)[6]; \
-      H = (r)[7]; \
-      for (i = 0; i < 80; i += 8) { \
-         SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
-         SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
-         SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
-         SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
-         SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
-         SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
-         SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
-         SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
-      } \
-      (r)[0] = SPH_T64((r)[0] + A); \
-      (r)[1] = SPH_T64((r)[1] + B); \
-      (r)[2] = SPH_T64((r)[2] + C); \
-      (r)[3] = SPH_T64((r)[3] + D); \
-      (r)[4] = SPH_T64((r)[4] + E); \
-      (r)[5] = SPH_T64((r)[5] + F); \
-      (r)[6] = SPH_T64((r)[6] + G); \
-      (r)[7] = SPH_T64((r)[7] + H); \
-   } while (0)
-
-/*
- * One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access.
- */
-static void
-sha3_round(const unsigned char *data, sph_u64 r[8])
-{
-#define SHA3_IN(x)   sph_dec64be_aligned(data + (8 * (x)))
-   SHA3_ROUND_BODY(SHA3_IN, r);
-#undef SHA3_IN
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_init(void *cc)
-{
-   sph_sha384_context *sc;
-
-   sc = cc;
-   memcpy(sc->val, H384, sizeof H384);
-   sc->count = 0;
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_init(void *cc)
-{
-   sph_sha512_context *sc;
-
-   sc = cc;
-   memcpy(sc->val, H512, sizeof H512);
-   sc->count = 0;
-}
-
-#define RFUN   sha3_round
-#define HASH   sha384
-#define BE64   1
-#include "md_helper.c"
-
-/* see sph_sha3.h */
-void
-sph_sha384_close(void *cc, void *dst)
-{
-   sha384_close(cc, dst, 6);
-// sph_sha384_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-   sha384_addbits_and_close(cc, ub, n, dst, 6);
-// sph_sha384_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_close(void *cc, void *dst)
-{
-   sha384_close(cc, dst, 8);
-// sph_sha512_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-   sha384_addbits_and_close(cc, ub, n, dst, 8);
-// sph_sha512_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
-{
-#define SHA3_IN(x)   msg[x]
-   SHA3_ROUND_BODY(SHA3_IN, val);
-#undef SHA3_IN
-}
-
-#endif
@@ -30,13 +30,282 @@
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#if defined(__AVX__)

#include <stddef.h>
#include <string.h>

#include "sha2-hash-4way.h"

#include <stdio.h>

// SHA256 4 way 32 bit

static const sph_u32 H256[8] = {
   SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
   SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
   SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
   SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};

static const sph_u32 K256[64] = {
   SPH_C32(0x428A2F98), SPH_C32(0x71374491),
   SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
   SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
   SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
   SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
   SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
   SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
   SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
   SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
   SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
   SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
   SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
   SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
   SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
   SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
   SPH_C32(0x06CA6351), SPH_C32(0x14292967),
   SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
   SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
   SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
   SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
   SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
   SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
   SPH_C32(0xD192E819), SPH_C32(0xD6990624),
   SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
   SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
   SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
   SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
   SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
   SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
   SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
   SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
   SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};

#define SHA2s_MEXP( a, b, c, d ) \
   _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
        SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );

#define CHs(X, Y, Z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )

#define MAJs(X, Y, Z) \
   _mm_or_si128( _mm_and_si128( X, Y ), \
                 _mm_and_si128( _mm_or_si128( X, Y ), Z ) )

#define BSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
        mm_rotr_32(x, 2), mm_rotr_32(x, 13) ), mm_rotr_32( x, 22) )

#define BSG2_1(x) \
   _mm_xor_si128( _mm_xor_si128( \
        mm_rotr_32(x, 6), mm_rotr_32(x, 11) ), mm_rotr_32( x, 25) )

#define SSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
        mm_rotr_32(x, 7), mm_rotr_32(x, 18) ), _mm_srli_epi32(x, 3) )

#define SSG2_1(x) \
   _mm_xor_si128( _mm_xor_si128( \
        mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )

#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   register __m128i T1, T2; \
   T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
        _mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
        _mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
   T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
   D = _mm_add_epi32( D, T1 ); \
   H = _mm_add_epi32( T1, T2 ); \
} while (0)

static void
sha256_4way_round( __m128i *in, __m128i r[8] )
{
   register __m128i A, B, C, D, E, F, G, H;
   __m128i W[16];

   W[ 0] = mm_bswap_32( in[ 0] );
   W[ 1] = mm_bswap_32( in[ 1] );
   W[ 2] = mm_bswap_32( in[ 2] );
   W[ 3] = mm_bswap_32( in[ 3] );
   W[ 4] = mm_bswap_32( in[ 4] );
   W[ 5] = mm_bswap_32( in[ 5] );
   W[ 6] = mm_bswap_32( in[ 6] );
   W[ 7] = mm_bswap_32( in[ 7] );
   W[ 8] = mm_bswap_32( in[ 8] );
   W[ 9] = mm_bswap_32( in[ 9] );
   W[10] = mm_bswap_32( in[10] );
   W[11] = mm_bswap_32( in[11] );
   W[12] = mm_bswap_32( in[12] );
   W[13] = mm_bswap_32( in[13] );
   W[14] = mm_bswap_32( in[14] );
   W[15] = mm_bswap_32( in[15] );

   A = r[0];
   B = r[1];
   C = r[2];
   D = r[3];
   E = r[4];
   F = r[5];
   G = r[6];
   H = r[7];

   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

   for ( int j = 16; j < 64; j += 16 )
   {
      W[ 0] = SHA2s_MEXP( 14,  9,  1,  0 );
      W[ 1] = SHA2s_MEXP( 15, 10,  2,  1 );
      W[ 2] = SHA2s_MEXP(  0, 11,  3,  2 );
      W[ 3] = SHA2s_MEXP(  1, 12,  4,  3 );
      W[ 4] = SHA2s_MEXP(  2, 13,  5,  4 );
      W[ 5] = SHA2s_MEXP(  3, 14,  6,  5 );
      W[ 6] = SHA2s_MEXP(  4, 15,  7,  6 );
      W[ 7] = SHA2s_MEXP(  5,  0,  8,  7 );
      W[ 8] = SHA2s_MEXP(  6,  1,  9,  8 );
      W[ 9] = SHA2s_MEXP(  7,  2, 10,  9 );
      W[10] = SHA2s_MEXP(  8,  3, 11, 10 );
      W[11] = SHA2s_MEXP(  9,  4, 12, 11 );
      W[12] = SHA2s_MEXP( 10,  5, 13, 12 );
      W[13] = SHA2s_MEXP( 11,  6, 14, 13 );
      W[14] = SHA2s_MEXP( 12,  7, 15, 14 );
      W[15] = SHA2s_MEXP( 13,  8,  0, 15 );

      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
   }

   r[0] = _mm_add_epi32( r[0], A );
   r[1] = _mm_add_epi32( r[1], B );
   r[2] = _mm_add_epi32( r[2], C );
   r[3] = _mm_add_epi32( r[3], D );
   r[4] = _mm_add_epi32( r[4], E );
   r[5] = _mm_add_epi32( r[5], F );
   r[6] = _mm_add_epi32( r[6], G );
   r[7] = _mm_add_epi32( r[7], H );
}

void sha256_4way_init( sha256_4way_context *sc )
{
   sc->count_high = sc->count_low = 0;
   sc->val[0] = _mm_set1_epi32( H256[0] );
   sc->val[1] = _mm_set1_epi32( H256[1] );
   sc->val[2] = _mm_set1_epi32( H256[2] );
   sc->val[3] = _mm_set1_epi32( H256[3] );
   sc->val[4] = _mm_set1_epi32( H256[4] );
   sc->val[5] = _mm_set1_epi32( H256[5] );
   sc->val[6] = _mm_set1_epi32( H256[6] );
   sc->val[7] = _mm_set1_epi32( H256[7] );
}

void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;
   size_t ptr;
   const int buf_size = 64;

   ptr = (unsigned)sc->count_low & (buf_size - 1U);
   while ( len > 0 )
   {
      size_t clen;
      uint32_t clow, clow2;

      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
      vdata = vdata + (clen>>2);
      ptr += clen;
      len -= clen;
      if ( ptr == buf_size )
      {
         sha256_4way_round( sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
      clow2 = SPH_T32( clow + clen );
      sc->count_low = clow2;
      if ( clow2 < clow )
         sc->count_high++;
   }
}

void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
   unsigned ptr, u;
   uint32_t low, high;
   const int buf_size = 64;
   const int pad = buf_size - 8;

   ptr = (unsigned)sc->count_low & (buf_size - 1U);
   sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
   ptr += 4;

   if ( ptr > pad )
   {
      memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
      sha256_4way_round( sc->buf, sc->val );
      memset_zero_128( sc->buf, pad >> 2 );
   }
   else
      memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );

   low = sc->count_low;
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;

   sc->buf[ pad >> 2 ] =
           mm_bswap_32( _mm_set1_epi32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] =
           mm_bswap_32( _mm_set1_epi32( low ) );
   sha256_4way_round( sc->buf, sc->val );
   for ( u = 0; u < 8; u ++ )
      ((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
}

#if defined(__AVX2__)

// SHA512 4 way 64 bit

static const sph_u64 H512[8] = {
   SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
   SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
   SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
   SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};

static const sph_u64 K512[80] = {
   SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
   SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
@@ -80,13 +349,6 @@ static const sph_u64 K512[80] = {
   SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};

-static const sph_u64 H512[8] = {
-   SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-   SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-   SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-   SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
#define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

@@ -112,7 +374,7 @@ static const sph_u64 H512[8] = {

#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
-  __m256i T1, T2; \
+  register __m256i T1, T2; \
   T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
        _mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
        _mm256_set1_epi64x( K512[i] ) ), W[i] ); \
@@ -125,11 +387,11 @@ static void
sha512_4way_round( __m256i *in, __m256i r[8] )
{
   int i;
-  __m256i A, B, C, D, E, F, G, H;
+  register __m256i A, B, C, D, E, F, G, H;
   __m256i W[80];

   for ( i = 0; i < 16; i++ )
-     W[i] = mm256_byteswap_64( in[i] );
+     W[i] = mm256_bswap_64( in[i] );
   for ( i = 16; i < 80; i++ )
      W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
           SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -182,7 +444,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
-  int buf_size = 128;
+  const int buf_size = 128;

   ptr = (unsigned)sc->count & (buf_size - 1U);
   while ( len > 0 )
@@ -207,13 +469,12 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
   unsigned ptr, u;
-  int buf_size = 128;
-  int pad = buf_size - 16;
+  const int buf_size = 128;
+  const int pad = buf_size - 16;

   ptr = (unsigned)sc->count & (buf_size - 1U);
   sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
   ptr += 8;

   if ( ptr > pad )
   {
      memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
@@ -224,13 +485,14 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
      memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

   sc->buf[ pad >> 3 ] =
-      mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+      mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
   sc->buf[ ( pad+8 ) >> 3 ] =
-      mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+      mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
   sha512_4way_round( sc->buf, sc->val );

   for ( u = 0; u < 8; u ++ )
-     ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
+     ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
}

-#endif
+#endif // __AVX2__
+#endif // __AVX__
|
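(For orientation, an illustrative sketch that is not part of the diff: these 4-way 64-bit contexts consume input interleaved so that 64-bit lane i of each __m256i word carries message i. A scalar model of that layout, with a hypothetical helper name:)

// Scalar model of 4x64 interleaving (hypothetical, for illustration):
// vector word j ends up holding { s0[j], s1[j], s2[j], s3[j] } across
// its four 64-bit lanes.
void interleave_4x64_model( uint64_t *dst, const uint64_t *s0,
                            const uint64_t *s1, const uint64_t *s2,
                            const uint64_t *s3, int bit_len )
{
   for ( int j = 0; j < bit_len / 64; j++ )
   {
      dst[ 4*j + 0 ] = s0[j];
      dst[ 4*j + 1 ] = s1[j];
      dst[ 4*j + 2 ] = s2[j];
      dst[ 4*j + 3 ] = s3[j];
   }
}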
@@ -44,47 +44,19 @@
#include "sph_types.h"
#include "avxdefs.h"

#if 0

#define SPH_SIZE_sha224 224
#if defined(__AVX__)

#define SPH_SIZE_sha256 256

typedef struct {
#ifndef DOXYGEN_IGNORE
   unsigned char buf[64];    /* first field, for alignment */
   sph_u32 val[8];
#if SPH_64
   sph_u64 count;
#else
   sph_u32 count_high, count_low;
#endif
#endif
} sph_sha224_context;
   __m128i buf[64>>2];
   __m128i val[8];
   uint32_t count_high, count_low;
} sha256_4way_context;

typedef sph_sha224_context sph_sha256_context;

void sph_sha224_init(void *cc);

void sph_sha224(void *cc, const void *data, size_t len);

void sph_sha224_close(void *cc, void *dst);

void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);

void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);

void sph_sha256_init(void *cc);

void sph_sha256(void *cc, const void *data, size_t len);

void sph_sha256_close(void *cc, void *dst);

void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);

void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);

#endif
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );

#if defined (__AVX2__)

@@ -102,3 +74,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst );

#endif
#endif
#endif
@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
   C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};

// Return hi 128 bits with elements shifted one lane with vacated lane filled
// with data rotated from lo.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits.
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.

static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{  return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
                        _mm_slli_si128( lo, 16 - (n<<2) ) );
}

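(A scalar illustration of that partial rotation; this helper is hypothetical and not part of the diff. Treating hi and lo as four 32-bit lanes each, the result keeps the top 4-n lanes of hi and fills the vacated lanes from the bottom of lo.)

// Scalar model of mm_rotr256hi_1x32, for illustration only.
// With n = 1 the result lanes are { hi[1], hi[2], hi[3], lo[0] }.
static inline void rotr256hi_1x32_model( uint32_t r[4], const uint32_t hi[4],
                                         const uint32_t lo[4], int n )
{
   for ( int i = 0; i < 4; i++ )
      r[i] = ( i < 4 - n ) ? hi[ i + n ] : lo[ i - ( 4 - n ) ];
}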
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
   sph_u32 t0 = (x0); \
   sph_u32 t1 = (x1); \
@@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
   // round
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k02 = m[2];
   x = _mm_xor_si128( x, k02 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k03 = m[3];
   x = _mm_xor_si128( x, k03 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );
   p0 = _mm_xor_si128( p0, x );

   k10 = m[4];
   x = _mm_xor_si128( p3, k10 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k11 = m[5];
   x = _mm_xor_si128( x, k11 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k12 = m[6];
   x = _mm_xor_si128( x, k12 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );

   k13 = m[7];
   x = _mm_xor_si128( x, k13 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );
   p2 = _mm_xor_si128( p2, x );

   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      if ( r == 0 )
@@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
                    ~sc->count3, sc->count2, sc->count1, sc->count0 ) );

      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, mm_zero );
      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      if ( r == 1 )
@@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
                    ~sc->count0, sc->count1, sc->count2, sc->count3 ) );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, mm_zero );
      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, mm_zero );
      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p2, k10 );
      x = _mm_aesenc_si128( x, mm_zero );
      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, mm_zero );
      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, mm_zero );
      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      if ( r == 2 )
@@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg )
                    ~sc->count1, sc->count0, sc->count3, sc->count2 ) );

      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );

      // round 2, 6, 10

      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
      x = _mm_xor_si128( p3, k00 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      p2 = _mm_xor_si128( p2, x );
      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
      x = _mm_xor_si128( p1, k10 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );

      // round 3, 7, 11

      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      x = _mm_xor_si128( p2, k00 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );

      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, mm_zero );
      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, mm_zero );
      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );
      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p0, k10 );
      x = _mm_aesenc_si128( x, mm_zero );
      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, mm_zero );
      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, mm_zero );
      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
      x = _mm_aesenc_si128( x, m128_zero );
      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );

      // round 4, 8, 12
@@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg )
      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );

      x = _mm_xor_si128( p1, k00 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );
      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );

      x = _mm_xor_si128( p3, k10 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );

      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, mm_zero );
      x = _mm_aesenc_si128( x, m128_zero );
      p2 = _mm_xor_si128( p2, x );
   }

   // round 13

   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
   k00 = _mm_xor_si128( k00, k13 );

   x = _mm_xor_si128( p0, k00 );
   x = _mm_aesenc_si128( x, mm_zero );
   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
   k01 = _mm_xor_si128( k01, k00 );

   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, mm_zero );
   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
   k02 = _mm_xor_si128( k02, k01 );

   x = _mm_xor_si128( x, k02 );
   x = _mm_aesenc_si128( x, mm_zero );
   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
   k03 = _mm_xor_si128( k03, k02 );

   x = _mm_xor_si128( x, k03 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );
   p3 = _mm_xor_si128( p3, x );
   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
   k10 = _mm_xor_si128( k10, k03 );

   x = _mm_xor_si128( p2, k10 );
   x = _mm_aesenc_si128( x, mm_zero );
   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
   k11 = _mm_xor_si128( k11, k10 );

   x = _mm_xor_si128( x, k11 );
   x = _mm_aesenc_si128( x, mm_zero );
   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
                 ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );

   x = _mm_xor_si128( x, k12 );
   x = _mm_aesenc_si128( x, mm_zero );
   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
   x = _mm_aesenc_si128( x, m128_zero );
   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
   k13 = _mm_xor_si128( k13, k12 );

   x = _mm_xor_si128( x, k13 );
   x = _mm_aesenc_si128( x, mm_zero );
   x = _mm_aesenc_si128( x, m128_zero );
   p1 = _mm_xor_si128( p1, x );

   h[0] = _mm_xor_si128( h[0], p2 );
853
algo/simd/simd-hash-2way.c
Normal file
@@ -0,0 +1,853 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "simd-hash-2way.h"

#if defined (__AVX2__)

// imported from simd_iv.h

uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
                           0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
                           0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
                           0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
                           0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
                           0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
                           0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
                           0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };

/* Twiddle tables */

static const m256_v16 FFT64_Twiddle[] =
{
   {{ 1,    2,   4,   8,  16,  32,  64, 128,
      1,    2,   4,   8,  16,  32,  64, 128 }},
   {{ 1,   60,   2, 120,   4, -17,   8, -34,
      1,   60,   2, 120,   4, -17,   8, -34 }},
   {{ 1,  120,   8, -68,  64, -30,  -2,  17,
      1,  120,   8, -68,  64, -30,  -2,  17 }},
   {{ 1,   46,  60, -67,   2,  92, 120, 123,
      1,   46,  60, -67,   2,  92, 120, 123 }},
   {{ 1,   92, -17, -22,  32, 117, -30,  67,
      1,   92, -17, -22,  32, 117, -30,  67 }},
   {{ 1,  -67, 120, -73,   8, -22, -68, -70,
      1,  -67, 120, -73,   8, -22, -68, -70 }},
   {{ 1,  123, -34, -70, 128,  67,  17,  35,
      1,  123, -34, -70, 128,  67,  17,  35 }},
};

static const m256_v16 FFT128_Twiddle[] =
{
   {{   1, -118,   46,  -31,   60,  116,  -67,  -61,
        1, -118,   46,  -31,   60,  116,  -67,  -61 }},
   {{   2,   21,   92,  -62,  120,  -25,  123, -122,
        2,   21,   92,  -62,  120,  -25,  123, -122 }},
   {{   4,   42,  -73, -124,  -17,  -50,  -11,   13,
        4,   42,  -73, -124,  -17,  -50,  -11,   13 }},
   {{   8,   84,  111,    9,  -34, -100,  -22,   26,
        8,   84,  111,    9,  -34, -100,  -22,   26 }},
   {{  16,  -89,  -35,   18,  -68,   57,  -44,   52,
       16,  -89,  -35,   18,  -68,   57,  -44,   52 }},
   {{  32,   79,  -70,   36,  121,  114,  -88,  104,
       32,   79,  -70,   36,  121,  114,  -88,  104 }},
   {{  64,  -99,  117,   72,  -15,  -29,   81,  -49,
       64,  -99,  117,   72,  -15,  -29,   81,  -49 }},
   {{ 128,   59,  -23, -113,  -30,  -58,  -95,  -98,
      128,   59,  -23, -113,  -30,  -58,  -95,  -98 }},
};

static const m256_v16 FFT256_Twiddle[] =
{
   {{   1,   41, -118,   45,   46,   87,  -31,   14,
        1,   41, -118,   45,   46,   87,  -31,   14 }},
   {{  60, -110,  116, -127,  -67,   80,  -61,   69,
       60, -110,  116, -127,  -67,   80,  -61,   69 }},
   {{   2,   82,   21,   90,   92,  -83,  -62,   28,
        2,   82,   21,   90,   92,  -83,  -62,   28 }},
   {{ 120,   37,  -25,    3,  123,  -97, -122, -119,
      120,   37,  -25,    3,  123,  -97, -122, -119 }},
   {{   4,  -93,   42,  -77,  -73,   91, -124,   56,
        4,  -93,   42,  -77,  -73,   91, -124,   56 }},
   {{ -17,   74,  -50,    6,  -11,   63,   13,   19,
      -17,   74,  -50,    6,  -11,   63,   13,   19 }},
   {{   8,   71,   84,  103,  111,  -75,    9,  112,
        8,   71,   84,  103,  111,  -75,    9,  112 }},
   {{ -34, -109, -100,   12,  -22,  126,   26,   38,
      -34, -109, -100,   12,  -22,  126,   26,   38 }},
   {{  16, -115,  -89,  -51,  -35,  107,   18,  -33,
       16, -115,  -89,  -51,  -35,  107,   18,  -33 }},
   {{ -68,   39,   57,   24,  -44,   -5,   52,   76,
      -68,   39,   57,   24,  -44,   -5,   52,   76 }},
   {{  32,   27,   79, -102,  -70,  -43,   36,  -66,
       32,   27,   79, -102,  -70,  -43,   36,  -66 }},
   {{ 121,   78,  114,   48,  -88,  -10,  104, -105,
      121,   78,  114,   48,  -88,  -10,  104, -105 }},
   {{  64,   54,  -99,   53,  117,  -86,   72,  125,
       64,   54,  -99,   53,  117,  -86,   72,  125 }},
   {{ -15, -101,  -29,   96,   81,  -20,  -49,   47,
      -15, -101,  -29,   96,   81,  -20,  -49,   47 }},
   {{ 128,  108,   59,  106,  -23,   85, -113,   -7,
      128,  108,   59,  106,  -23,   85, -113,   -7 }},
   {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94,
      -30,   55,  -58,  -65,  -95,  -40,  -98,   94 }}
};

#define SHUFXOR_1 0xb1          /* 0b10110001 */
#define SHUFXOR_2 0x4e          /* 0b01001110 */
#define SHUFXOR_3 0x1b          /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))

// imported from vector.c

#define REDUCE(x) \
  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
                    _mm256_srai_epi16( x, 8 ) )

#define EXTRA_REDUCE_S(x)\
  _mm256_sub_epi16( x, \
        _mm256_and_si256( _mm256_set1_epi16( 257 ), \
            _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )

#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )

#define DO_REDUCE( i ) X(i) = REDUCE( X(i) )

#define DO_REDUCE_FULL_S(i) \
do { \
   X(i) = REDUCE( X(i) ); \
   X(i) = EXTRA_REDUCE_S( X(i) ); \
} while(0)

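(The REDUCE macro above is a partial reduction modulo 257 on each 16-bit lane: since 256 is congruent to -1 mod 257, a value x = 256*h + l is congruent to l - h. A scalar sketch of the same step follows; the helper name is hypothetical and not part of the diff.)

// Scalar model of REDUCE, illustration only. (x & 255) - (x >> 8)
// equals x minus a multiple of 257, because 256 == -1 (mod 257).
// An arithmetic right shift is assumed, matching _mm256_srai_epi16.
static inline int16_t reduce_mod257_model( int16_t x )
{
   return (int16_t)( ( x & 255 ) - ( x >> 8 ) );
}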
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

#define DO_REDUCE(i) X(i) = REDUCE( X(i) )

  // Begin with 8 parallel DIF FFT_8 transforms
  //
  // FFT_8 using w=4 as 8th root of unity
  // Unrolled decimation in frequency (DIF) radix-2 NTT.
  // Output data is in revbin_permuted order.

  static const int w[] = {0, 2, 4, 6};
//  __m256i *Twiddle = (__m256i*)FFT64_Twiddle;


#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the columns.
  // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

  // Finish with 8 parallel DIT FFT_8 transforms
  // FFT_8 using w=4 as 8th root of unity
  // Unrolled decimation in time (DIT) radix-2 NTT.
  // Input data is in revbin_permuted order.

#define BUTTERFLY_0( i,j ) \
do { \
    __m256i u = X(j); \
    X(j) = _mm256_sub_epi16( X(j), X(i) ); \
    X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)


#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i u = X(j); \
    X(i) = _mm256_slli_epi16( X(i), w[n] ); \
    X(j) = _mm256_sub_epi16( X(j), X(i) ); \
    X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

#undef BUTTERFLY

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}

void fft128_2way( void *a )
{
  int i;
  // Temp space to help for interleaving in the end
  __m256i B[8];
  __m256i *A = (__m256i*) a;
//  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

  /* Size-2 butterflies */
  for ( i = 0; i<8; i++ )
  {
    B[ i ]   = _mm256_add_epi16( A[ i ], A[ i+8 ] );
    B[ i ]   = REDUCE_FULL_S( B[ i ] );
    A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
    A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
  }

  fft64_2way( B );
  fft64_2way( A+8 );

  /* Transpose (i.e. interleave) */
  for ( i = 0; i < 8; i++ )
  {
    A[ 2*i   ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] );
    A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] );
  }
}

void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

#define UNPACK( i ) \
do { \
    __m256i t = X[i]; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
    A[2*i+8] = REDUCE(A[2*i+8]); \
    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
    A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)

   // This allows tweaking the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
    __m256i t = X[i]; \
    __m256i tmp; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
    A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  if ( final )
    UNPACK_TWEAK( 3, FinalTweak.m256i );
  else
    UNPACK_TWEAK( 3, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  fft64_2way( a );
  fft64_2way( a+128 );
}

void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT256_Twiddle;

#define UNPACK( i ) \
do { \
   __m256i t = X[i]; \
   A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                       FFT256_Twiddle[ 2*i ].m256i ); \
   A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
   A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
   A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
                                       FFT256_Twiddle[ 2*i + 1 ].m256i ); \
   A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)

   // This allows tweaking the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
   __m256i t = X[i]; \
   __m256i tmp; \
   A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
   A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                       FFT256_Twiddle[ 2*i ].m256i ); \
   A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
   tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
   A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
   A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                       FFT256_Twiddle[ 2*i + 1 ].m256i ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  UNPACK( 3 );
  UNPACK( 4 );
  UNPACK( 5 );
  UNPACK( 6 );
  if ( final )
    UNPACK_TWEAK( 7, FinalTweak.m256i );
  else
    UNPACK_TWEAK( 7, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  fft128_2way( a );
  fft128_2way( a+256 );
}

void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
  register __m256i S0l, S1l, S2l, S3l;
  register __m256i S0h, S1h, S2h, S3h;
  __m256i *S = (__m256i*) state;
  __m256i *M = (__m256i*) msg;
  __m256i *W = (__m256i*) fft;
  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };

  S0l = _mm256_xor_si256( S[0], M[0] );
  S0h = _mm256_xor_si256( S[1], M[1] );
  S1l = _mm256_xor_si256( S[2], M[2] );
  S1h = _mm256_xor_si256( S[3], M[3] );
  S2l = _mm256_xor_si256( S[4], M[4] );
  S2h = _mm256_xor_si256( S[5], M[5] );
  S3l = _mm256_xor_si256( S[6], M[6] );
  S3h = _mm256_xor_si256( S[7], M[7] );

#define S(i) S##i

#define F_0(B, C, D) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
#define F_1(B, C, D) \
   _mm256_or_si256( _mm256_and_si256( D, C ),\
                    _mm256_and_si256( _mm256_or_si256( D,C ), B ) )

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

  // We split the round function in two halves
  // so as to insert some independent computations in between

#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

#define PERM_0(d,a) /* XOR 1 */ \
do { \
   d##l = shufxor( a##l, 1 ); \
   d##h = shufxor( a##h, 1 ); \
} while(0)

#define PERM_1(d,a) /* XOR 6 */ \
do { \
   d##l = shufxor( a##h, 2 ); \
   d##h = shufxor( a##l, 2 ); \
} while(0)

#define PERM_2(d,a) /* XOR 2 */ \
do { \
   d##l = shufxor( a##l, 2 ); \
   d##h = shufxor( a##h, 2 ); \
} while(0)

#define PERM_3(d,a) /* XOR 3 */ \
do { \
   d##l = shufxor( a##l, 3 ); \
   d##h = shufxor( a##h, 3 ); \
} while(0)

#define PERM_4(d,a) /* XOR 5 */ \
do { \
   d##l = shufxor( a##h, 1 ); \
   d##h = shufxor( a##l, 1 ); \
} while(0)

#define PERM_5(d,a) /* XOR 7 */ \
do { \
   d##l = shufxor( a##h, 3 ); \
   d##h = shufxor( a##l, 3 ); \
} while(0)

#define PERM_6(d,a) /* XOR 4 */ \
do { \
   d##l = a##h; \
   d##h = a##l; \
} while(0)

#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
   TTl  = Fl( a,b,c,fun ); \
   TTh  = Fh( a,b,c,fun ); \
   a##l = mm256_rotl_32( a##l, r ); \
   a##h = mm256_rotl_32( a##h, r ); \
   w##l = _mm256_add_epi32( w##l, d##l ); \
   w##h = _mm256_add_epi32( w##h, d##h ); \
   TTl  = _mm256_add_epi32( TTl, w##l ); \
   TTh  = _mm256_add_epi32( TTh, w##h ); \
   TTl  = mm256_rotl_32( TTl, s ); \
   TTh  = mm256_rotl_32( TTh, s ); \
   PERM( z,d,a ); \
} while(0)

#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )

#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
   d##l = _mm256_add_epi32( d##l, TTl ); \
   d##h = _mm256_add_epi32( d##h, TTh ); \
} while(0)

#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )

#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
do { \
   register __m256i TTl, TTh, Wl=w1, Wh=w2; \
   STEP_1( a,b,c,d,W,fun,r,s,z ); \
   STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);

#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)

#define MSG( w,hh,ll,u,z ) \
do { \
   int a = MSG_##u(hh); \
   int b = MSG_##u(ll); \
   w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
   w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
   w##h = _mm256_unpackhi_epi16( W[a], W[b] ); \
   w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
} while(0)

#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
do { \
   register __m256i W0l, W1l, W2l, W3l, TTl; \
   register __m256i W0h, W1h, W2h, W3h, TTh; \
   MSG( W0, h0, l0, u0, z ); \
   STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
   MSG( W1, h1, l1, u1, z ); \
   STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
   STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
   MSG( W2,h2,l2,u2,z ); \
   STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
   STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
   MSG( W3,h3,l3,u3,z ); \
   STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
   STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
   STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)

  // 4 rounds with code 185
#define PERM_START 0
  ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
  ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
  ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
  ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START

  // 4 rounds with code 233
#define PERM_START 2
  ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
  ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
  ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
  ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START

  // 1 round as feed-forward
#define PERM_START 4
  STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0 );
  STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
  STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
  STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3 );

  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;

#undef PERM_START
#undef STEP_1
#undef STEP_2
#undef STEP
#undef ROUND
}

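(A note on the SUM7 tables above: each SUM7_xy entry is simply (x + y) mod 7, unrolled so the preprocessor can resolve the PERM permutation choice per step at compile time. An equivalent runtime expression, illustration only:)

// Scalar equivalent of the unrolled SUM7_xy macros (hypothetical helper):
// SUM7_xy == (x + y) % 7 for all tabulated x, y.
static inline int sum7_model( int x, int y ) { return ( x + y ) % 7; }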
void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
{
  m256_v16 Y[32];
  uint16_t *y = (uint16_t*) Y[0].u16;
  fft256_2way_msg( y, m, final );
  rounds512_2way( state->A, m, y );
}

// imported from nist.c

int simd_2way_init( simd_2way_context *state, int hashbitlen )
{
  __m256i *A = (__m256i*)state->A;
  int n = 8;

  state->hashbitlen = hashbitlen;
  state->n_feistels = n;
  state->blocksize = 128*8;
  state->count = 0;

  for ( int i = 0; i < 8; i++ )
     A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
                              SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0],
                              SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
                              SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
  return 0;
}

int simd_2way_update( simd_2way_context *state, const void *data,
                      int databitlen )
{
  int bs = state->blocksize;
  int current = state->count & (bs - 1);

  while ( databitlen > 0 )
  {
    if ( current == 0 && databitlen >= bs )
    {
      // We can hash the data directly from the input buffer.
      SIMD_2way_Compress( state, data, 0 );
      databitlen -= bs;
      data += 2*(bs/8);
      state->count += bs;
    }
    else
    {
      // Copy a chunk of data to the buffer
      int len = bs - current;
      if ( databitlen < len )
      {
        memcpy( state->buffer + 2*(current/8), data, 2*((databitlen+7)/8) );
        state->count += databitlen;
        return 0;
      }
      else
      {
        memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
        state->count += len;
        databitlen -= len;
        data += 2*(len/8);
        current = 0;
        SIMD_2way_Compress( state, state->buffer, 0 );
      }
    }
  }
  return 0;
}

int simd_2way_close( simd_2way_context *state, void *hashval )
{
  uint64_t l;
  int current = state->count & (state->blocksize - 1);
  int i;
  int isshort = 1;

  // If there is still some data in the buffer, hash it
  if ( current )
  {
    current = ( current+7 ) / 8;
    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) );
    SIMD_2way_Compress( state, state->buffer, 0 );
  }

  // Input the message length as the last block
  memset( state->buffer, 0, 2*(state->blocksize / 8) );
  l = state->count;
  for ( i = 0; i < 8; i++ )
  {
    state->buffer[ i ]    = l & 0xff;
    state->buffer[ i+16 ] = l & 0xff;
    l >>= 8;
  }
  if ( state->count < 16384 )
    isshort = 2;

  SIMD_2way_Compress( state, state->buffer, isshort );
  memcpy( hashval, state->A, 2*(state->hashbitlen / 8) );

  return 0;
}

int simd_2way_update_close( simd_2way_context *state, void *hashval,
                            const void *data, int databitlen )
{
  int current, i;
  int bs = state->blocksize;  // bits in one lane
  int isshort = 1;
  uint64_t l;

  current = state->count & (bs - 1);

  while ( databitlen > 0 )
  {
    if ( current == 0 && databitlen >= bs )
    {
      // We can hash the data directly from the input buffer.
      SIMD_2way_Compress( state, data, 0 );
      databitlen -= bs;
      data += 2*( bs/8 );
      state->count += bs;
    }
    else
    {
      // Copy a chunk of data to the buffer
      int len = bs - current;
      if ( databitlen < len )
      {
        memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
        state->count += databitlen;
        break;
      }
      else
      {
        memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
        state->count += len;
        databitlen -= len;
        data += 2*( len/8 );
        current = 0;
        SIMD_2way_Compress( state, state->buffer, 0 );
      }
    }
  }

  current = state->count & (state->blocksize - 1);

  // If there is still some data in the buffer, hash it
  if ( current )
  {
    current = ( current+7 ) / 8;
    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
    SIMD_2way_Compress( state, state->buffer, 0 );
  }

  // Input the message length as the last block
  memset( state->buffer, 0, 2*( state->blocksize/8 ) );
  l = state->count;
  for ( i = 0; i < 8; i++ )
  {
    state->buffer[ i ]    = l & 0xff;
    state->buffer[ i+16 ] = l & 0xff;
    l >>= 8;
  }
  if ( state->count < 16384 )
    isshort = 2;

  SIMD_2way_Compress( state, state->buffer, isshort );
  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
  return 0;
}

#endif
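(Illustration only, not part of the diff: a minimal sketch of driving the 2-way SIMD API above on two 512-bit inputs. It assumes the two inputs are interleaved 128 bits at a time, as the mm256_interleave_2x128 calls in later hunks do, and that databitlen counts bits per lane.)

// Hypothetical usage sketch of the 2-way SIMD-512 API shown above.
// 'vhash' holds two 64-byte inputs interleaved 128 bits at a time;
// 512 is the per-lane input size in bits.
void simd512_2way_example( void *out_2x128, const void *vhash )
{
  simd_2way_context ctx;
  simd_2way_init( &ctx, 512 );
  simd_2way_update_close( &ctx, out_2x128, vhash, 512 );
}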
27
algo/simd/simd-hash-2way.h
Normal file
@@ -0,0 +1,27 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1

#include "simd-compat.h"

#if defined(__AVX2__)

#include "avxdefs.h"

typedef struct {
  uint32_t A[ 32*2 ] __attribute__((aligned(64)));
  uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
  uint64_t count;
  unsigned int hashbitlen;
  unsigned int blocksize;
  unsigned int n_feistels;

} simd_2way_context;

int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
                      int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
                            const void *data, int databitlen );
#endif
#endif
@@ -1,3 +1,6 @@
#if !defined(SIMD_IV_H__)
#define SIMD_IV_H__

u32 IV_224[] = {
  0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
  0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
  0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
  0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};

#endif
@@ -1,23 +0,0 @@
|
||||
|
||||
#ifndef DEFS_X5_H__
|
||||
#define DEFS_X5_H__
|
||||
#include <emmintrin.h>
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
|
||||
|
||||
typedef unsigned char uint8;
|
||||
typedef unsigned int uint32;
|
||||
typedef unsigned long long uint64;
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8]; /* Buffer to be hashed */
|
||||
__m128i chainv[10]; /* Chaining values */
|
||||
uint64 bitlen[2]; /* Message length in bits */
|
||||
uint32 rembitlen; /* Length of buffer data to be hashed */
|
||||
int hashbitlen;
|
||||
} hashState_luffa;
|
||||
|
||||
|
||||
typedef unsigned char byte;
|
||||
#endif
|
File diff suppressed because it is too large
@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
  v16* const A = a;

  register v16 X0, X1, X2, X3, X4, X5, X6, X7;

/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif

*/
#define X(i) X##i

  X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
  STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);

  S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}

@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
 */
#define PERM_START 0
  ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
  ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
  ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
  ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START

/*
 * 4 rounds with code 233
 */
#define PERM_START 2
  ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
  ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
  ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
  ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START


/*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
  STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
  STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
  STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START

  S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
  S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}

void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
@@ -1,47 +1,29 @@
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"

#if defined (__AVX2__)
#if defined (SKEIN_4WAY)

void skeinhash_4way( void *state, const void *input )
{
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
     uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
     skein512_4way_context ctx_skein;
     SHA256_CTX ctx_sha256;
     sha256_4way_context ctx_sha256;

     skein512_4way_init( &ctx_skein );
     skein512_4way( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash );
     skein512_4way_close( &ctx_skein, vhash64 );

     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     mm256_reinterleave_4x32( vhash32, vhash64, 512 );

     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
     sha256_4way_init( &ctx_sha256 );
     sha256_4way( &ctx_sha256, vhash32, 64 );
     sha256_4way_close( &ctx_sha256, vhash32 );

     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );

     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );

     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );

     memcpy( state, hash0, 32 );
     memcpy( state + 32, hash1, 32 );
     memcpy( state + 64, hash2, 32 );
     memcpy( state + 96, hash3, 32 );
     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash32, 256 );
}

int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst )
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

   count[0] = mm_byteswap_32(
   count[0] = mm_bswap_32(
                _mm_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
   count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                              ( ctx->num << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   for ( i = 0; i < 8 ; i++ )
      hash[i] = mm_byteswap_32( ctx->digest[i] );
      hash[i] = mm_bswap_32( ctx->digest[i] );
}

#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
@@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   int j;

   for ( j = 0; j < 16; j++ )
      W[j] = mm_byteswap_32( block[j] );
      W[j] = mm_bswap_32( block[j] );

   for ( j = 16; j < 68; j++ )
      W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
@@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
   sc->buf[ SPH_MAXPAD>>3 ] =
              mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
              mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
   memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
   sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
              mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
              mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
   sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
              mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
              mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
   sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
              mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
              mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
   sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
              mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
              mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
   for ( u = 0; u < rnum; u ++ )
   {
#if defined BE64
      ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
      ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else  // LE64
      ((__m256i*)dst)[u] = sc->val[u];
#endif
@@ -11,15 +11,15 @@ void whirlpoolx_hash(void *state, const void *input)
        sph_whirlpool_context ctx_whirlpool;

        unsigned char hash[64];
        unsigned char hash_xored[32];
//      unsigned char hash_xored[32];

        sph_whirlpool1_init(&ctx_whirlpool);
        sph_whirlpool1(&ctx_whirlpool, input, 80);
        sph_whirlpool1_close(&ctx_whirlpool, hash);

        // compress the 48 first bytes of the hash to 32
        for (int i = 0; i < 32; i++)
                hash_xored[i] = hash[i] ^ hash[i + 16];
//      for (int i = 0; i < 32; i++)
//              hash_xored[i] = hash[i] ^ hash[i + 16];

        memcpy(state, hash, 32);
}

@@ -12,10 +12,10 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

 typedef struct {
@@ -25,10 +25,10 @@ typedef struct {
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
    sph_shavite512_context shavite;
-   hashState_sd simd;
+   simd_2way_context simd;
    hashState_echo echo;
 } c11_4way_ctx_holder;

@@ -42,10 +42,10 @@ void init_c11_4way_ctx()
    skein512_4way_init( &c11_4way_ctx.skein );
    jh512_4way_init( &c11_4way_ctx.jh );
    keccak512_4way_init( &c11_4way_ctx.keccak );
-   init_luffa( &c11_4way_ctx.luffa, 512 );
+   luffa_2way_init( &c11_4way_ctx.luffa, 512 );
    cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
    sph_shavite512_init( &c11_4way_ctx.shavite );
-   init_sd( &c11_4way_ctx.simd, 512 );
+   simd_2way_init( &c11_4way_ctx.simd, 512 );
    init_echo( &c11_4way_ctx.echo, 512 );
 }

@@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
    c11_4way_ctx_holder ctx;
    memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );

@@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input )
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

    // 7 Luffa
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                           (const BitSequence*)hash0, 64 );
-   memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                           (const BitSequence*)hash1, 64 );
-   memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                           (const BitSequence*)hash2, 64 );
-   memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                           (const BitSequence*)hash3, 64 );
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+   luffa_2way_init( &ctx.luffa, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

    // 8 Cubehash
    cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input )
    sph_shavite512_close( &ctx.shavite, hash3 );

    // 10 Simd
-   update_final_sd( &ctx.simd, (BitSequence *)hash0,
-                    (const BitSequence *)hash0, 512 );
-   memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash1,
-                    (const BitSequence *)hash1, 512 );
-   memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash2,
-                    (const BitSequence *)hash2, 512 );
-   memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash3,
-                    (const BitSequence *)hash3, 512 );
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
+   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+   simd_2way_init( &ctx.simd, 512 );
+   simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

    // 11 Echo
    update_final_echo( &ctx.echo, (BitSequence *)hash0,
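Note: every 4way hunk above and below applies the same conversion: four serial Luffa (or SIMD) calls, each followed by a context restore, become two vectorized 2-way calls over 2x128-bit interleaved lanes. A condensed sketch of the pattern, assuming the declarations from algo/luffa/luffa-hash-2way.h and the mm256 interleave helpers used in these hunks; the wrapper name luffa512_4lane_sketch is illustrative, not part of the repo:

#include <stdint.h>

// Hash four independent 64-byte lanes with two 2-way Luffa passes.
static void luffa512_4lane_sketch( luffa_2way_context *ctx,
     uint64_t *hash0, uint64_t *hash1, uint64_t *hash2, uint64_t *hash3 )
{
   uint64_t vhashA[8*2] __attribute__ ((aligned (64)));
   uint64_t vhashB[8*2] __attribute__ ((aligned (64)));

   mm256_interleave_2x128( vhashA, hash0, hash1, 512 );   // pack lanes 0,1
   mm256_interleave_2x128( vhashB, hash2, hash3, 512 );   // pack lanes 2,3
   luffa_2way_update_close( ctx, vhashA, vhashA, 64 );    // hash lanes 0,1
   luffa_2way_init( ctx, 512 );                           // reset for next pair
   luffa_2way_update_close( ctx, vhashB, vhashB, 64 );    // hash lanes 2,3
   mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 ); // unpack results
   mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
}

This trades four hash invocations and three 64-byte context restores for two invocations plus the interleave shuffles, which is the whole point of the conversion.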
@@ -22,9 +22,9 @@
 #include "algo/echo/aes_ni/hash_api.h"
 #endif

-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/bmw/sse2/bmw.c"
@@ -12,7 +12,7 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"

 static __thread uint32_t s_ntime = UINT32_MAX;
@@ -25,7 +25,7 @@ typedef struct {
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
 } tt8_4way_ctx_holder;

@@ -39,7 +39,7 @@ void init_tt8_4way_ctx()
    skein512_4way_init( &tt8_4way_ctx.skein );
    jh512_4way_init( &tt8_4way_ctx.jh );
    keccak512_4way_init( &tt8_4way_ctx.keccak );
-   init_luffa( &tt8_4way_ctx.luffa, 512 );
+   luffa_2way_init( &tt8_4way_ctx.luffa, 512 );
    cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
 };

@@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input)
      case 6:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhashA, dataLen<<3 );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                                (const BitSequence *)hash0, dataLen );
-        memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                                (const BitSequence*)hash1, dataLen );
-        memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                                (const BitSequence*)hash2, dataLen );
-        memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                                (const BitSequence*)hash3, dataLen );
+        mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
+        luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+        mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
+        mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
+        luffa_2way_init( &ctx.luffa, 512 );
+        luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+        mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
        if ( i != 7 )
           mm256_interleave_4x64( vhashB,
                                  hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -9,7 +9,7 @@
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #ifdef NO_AES_NI
 #include "algo/groestl/sph_groestl.h"
@@ -12,10 +12,10 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/simd-hash-2way.h"

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -27,10 +27,10 @@ typedef struct {
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
    sph_shavite512_context shavite;
-   hashState_sd simd;
+   simd_2way_context simd;
 } tt10_4way_ctx_holder;

 tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
@@ -43,10 +43,10 @@ void init_tt10_4way_ctx()
    skein512_4way_init( &tt10_4way_ctx.skein );
    jh512_4way_init( &tt10_4way_ctx.jh );
    keccak512_4way_init( &tt10_4way_ctx.keccak );
-   init_luffa( &tt10_4way_ctx.luffa, 512 );
+   luffa_2way_init( &tt10_4way_ctx.luffa, 512 );
    cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
    sph_shavite512_init( &tt10_4way_ctx.shavite );
-   init_sd( &tt10_4way_ctx.simd, 512 );
+   simd_2way_init( &tt10_4way_ctx.simd, 512 );
 };

 void timetravel10_4way_hash(void *output, const void *input)
@@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input)
      case 6:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhashA, dataLen<<3 );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                                (const BitSequence *)hash0, dataLen );
-        memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                                (const BitSequence*)hash1, dataLen );
-        memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                                (const BitSequence*)hash2, dataLen );
-        memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                                (const BitSequence*)hash3, dataLen );
+        mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
+        luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+        mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
+        mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
+        luffa_2way_init( &ctx.luffa, 512 );
+        luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+        mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
        if ( i != 9 )
           mm256_interleave_4x64( vhashB,
                                  hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input)
      case 9:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhashA, dataLen<<3 );
-        update_final_sd( &ctx.simd, (BitSequence *)hash0,
-                         (const BitSequence *)hash0, dataLen<<3 );
-        memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
-        update_final_sd( &ctx.simd, (BitSequence *)hash1,
-                         (const BitSequence *)hash1, dataLen<<3 );
-        memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
-        update_final_sd( &ctx.simd, (BitSequence *)hash2,
-                         (const BitSequence *)hash2, dataLen<<3 );
-        memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
-        update_final_sd( &ctx.simd, (BitSequence *)hash3,
-                         (const BitSequence *)hash3, dataLen<<3 );
+        mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
+        simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+        mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
+        mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
+        simd_2way_init( &ctx.simd, 512 );
+        simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+        mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
        if ( i != 9 )
           mm256_interleave_4x64( vhashB,
                                  hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -8,10 +8,10 @@
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"

 #ifdef NO_AES_NI
 #include "algo/groestl/sph_groestl.h"
@@ -5,17 +5,16 @@

 #include <string.h>
 #include <stdint.h>

 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

 typedef struct {
@@ -25,10 +24,10 @@ typedef struct {
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
    sph_shavite512_context shavite;
-   hashState_sd simd;
+   simd_2way_context simd;
    hashState_echo echo;
 } x11_4way_ctx_holder;

@@ -42,10 +41,10 @@ void init_x11_4way_ctx()
    skein512_4way_init( &x11_4way_ctx.skein );
    jh512_4way_init( &x11_4way_ctx.jh );
    keccak512_4way_init( &x11_4way_ctx.keccak );
-   init_luffa( &x11_4way_ctx.luffa, 512 );
+   luffa_2way_init( &x11_4way_ctx.luffa, 512 );
    cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
    sph_shavite512_init( &x11_4way_ctx.shavite );
-   init_sd( &x11_4way_ctx.simd, 512 );
+   simd_2way_init( &x11_4way_ctx.simd, 512 );
    init_echo( &x11_4way_ctx.echo, 512 );
 }

@@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
+
    x11_4way_ctx_holder ctx;
    memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );

@@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input )
    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );

    // Serial
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-   // 7 Luffa
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                           (const BitSequence*)hash0, 64 );
-   memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                           (const BitSequence*)hash1, 64 );
-   memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                           (const BitSequence*)hash2, 64 );
-   memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                           (const BitSequence*)hash3, 64 );
+   // 7 Luffa parallel 2 way 128 bit
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+   luffa_2way_init( &ctx.luffa, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

    // 8 Cubehash
    cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input )
    sph_shavite512_close( &ctx.shavite, hash3 );

    // 10 Simd
-   update_final_sd( &ctx.simd, (BitSequence *)hash0,
-                    (const BitSequence *)hash0, 512 );
-   memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash1,
-                    (const BitSequence *)hash1, 512 );
-   memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash2,
-                    (const BitSequence *)hash2, 512 );
-   memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash3,
-                    (const BitSequence *)hash3, 512 );
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
+   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+   simd_2way_init( &ctx.simd, 512 );
+   simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

    // 11 Echo
    update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -10,10 +10,8 @@
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/sph_luffa.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/sph_simd.h"
 #include "algo/echo/sph_echo.h"

 #ifndef NO_AES_NI
@@ -21,9 +19,9 @@
 #include "algo/echo/aes_ni/hash_api.h"
 #endif

-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/bmw/sse2/bmw.c"
@@ -11,15 +11,12 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/echo/aes_ni/hash_api.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/simd-hash-2way.h"

 typedef struct {
    blake512_4way_context blake;
@@ -28,10 +25,10 @@ typedef struct {
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
    sph_shavite512_context shavite;
-   hashState_sd simd;
+   simd_2way_context simd;
    hashState_echo echo;
 } x11evo_4way_ctx_holder;

@@ -45,10 +42,10 @@ void init_x11evo_4way_ctx()
    skein512_4way_init( &x11evo_4way_ctx.skein );
    jh512_4way_init( &x11evo_4way_ctx.jh );
    keccak512_4way_init( &x11evo_4way_ctx.keccak );
-   init_luffa( &x11evo_4way_ctx.luffa, 512 );
+   luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
    cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
    sph_shavite512_init( &x11evo_4way_ctx.shavite );
-   init_sd( &x11evo_4way_ctx.simd, 512 );
+   simd_2way_init( &x11evo_4way_ctx.simd, 512 );
    init_echo( &x11evo_4way_ctx.echo, 512 );
 }

@@ -142,20 +139,13 @@ void x11evo_4way_hash( void *state, const void *input )
      case 6:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhash, 64<<3 );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                                (const BitSequence*)hash0, 64 );
-        memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
-                sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                                (const BitSequence*)hash1, 64 );
-        memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
-                sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                                (const BitSequence*)hash2, 64 );
-        memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
-                sizeof(hashState_luffa) );
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                                (const BitSequence*)hash3, 64 );
+        mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
+        luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+        mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
+        mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
+        luffa_2way_init( &ctx.luffa, 512 );
+        luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+        mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
        if ( i < len-1 )
           mm256_interleave_4x64( vhash,
                                  hash0, hash1, hash2, hash3, 64<<3 );
@@ -202,17 +192,13 @@ void x11evo_4way_hash( void *state, const void *input )
      case 9:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhash, 64<<3 );
-        update_final_sd( &ctx.simd, (BitSequence *)hash0,
-                         (const BitSequence *)hash0, 512 );
-        memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
-        update_final_sd( &ctx.simd, (BitSequence *)hash1,
-                         (const BitSequence *)hash1, 512 );
-        memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
-        update_final_sd( &ctx.simd, (BitSequence *)hash2,
-                         (const BitSequence *)hash2, 512 );
-        memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
-        update_final_sd( &ctx.simd, (BitSequence *)hash3,
-                         (const BitSequence *)hash3, 512 );
+        mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
+        simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
+        mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
+        mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
+        simd_2way_init( &ctx.simd, 512 );
+        simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
+        mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
        if ( i < len-1 )
           mm256_interleave_4x64( vhash,
                                  hash0, hash1, hash2, hash3, 64<<3 );
@@ -22,9 +22,9 @@
 #include "algo/echo/aes_ni/hash_api.h"
 #endif

-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"

 typedef struct {
 #ifdef NO_AES_NI
@@ -13,10 +13,10 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/gost/sph_gost.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

 typedef struct {
@@ -27,10 +27,10 @@ typedef struct {
    jh512_4way_context jh;
    keccak512_4way_context keccak;
    sph_gost512_context gost;
-   hashState_luffa luffa;
+   luffa_2way_context luffa;
    cubehashParam cube;
    sph_shavite512_context shavite;
-   hashState_sd simd;
+   simd_2way_context simd;
    hashState_echo echo;
 } x11gost_4way_ctx_holder;

@@ -45,10 +45,10 @@ void init_x11gost_4way_ctx()
    jh512_4way_init( &x11gost_4way_ctx.jh );
    keccak512_4way_init( &x11gost_4way_ctx.keccak );
    sph_gost512_init( &x11gost_4way_ctx.gost );
-   init_luffa( &x11gost_4way_ctx.luffa, 512 );
+   luffa_2way_init( &x11gost_4way_ctx.luffa, 512 );
    cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
    sph_shavite512_init( &x11gost_4way_ctx.shavite );
-   init_sd( &x11gost_4way_ctx.simd, 512 );
+   simd_2way_init( &x11gost_4way_ctx.simd, 512 );
    init_echo( &x11gost_4way_ctx.echo, 512 );
 }

@@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+
    x11gost_4way_ctx_holder ctx;
    memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );

@@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input )
    sph_gost512( &ctx.gost, hash3, 64 );
    sph_gost512_close( &ctx.gost, hash3 );

-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
-                           (const BitSequence*)hash0, 64 );
-   memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
-                           (const BitSequence*)hash1, 64 );
-   memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
-                           (const BitSequence*)hash2, 64 );
-   memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
-                           (const BitSequence*)hash3, 64 );
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+   luffa_2way_init( &ctx.luffa, 512 );
+   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+   mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );

    cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
    memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
@@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input )
    sph_shavite512( &ctx.shavite, hash3, 64 );
    sph_shavite512_close( &ctx.shavite, hash3 );

-   update_final_sd( &ctx.simd, (BitSequence *)hash0,
-                    (const BitSequence *)hash0, 512 );
-   memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash1,
-                    (const BitSequence *)hash1, 512 );
-   memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash2,
-                    (const BitSequence *)hash2, 512 );
-   memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
-   update_final_sd( &ctx.simd, (BitSequence *)hash3,
-                    (const BitSequence *)hash3, 512 );
+   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+   mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+   mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );

    update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
@@ -10,9 +10,9 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/sph_echo.h"

-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/bmw/sse2/bmw.c"
algo/x12/x12-4way.c (new file, 273 lines)
@@ -0,0 +1,273 @@
#include "x12-gate.h"

#if defined(X12_4WAY)

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"

typedef struct {
   blake512_4way_context blake;
   bmw512_4way_context bmw;
   hashState_groestl groestl;
   skein512_4way_context skein;
   jh512_4way_context jh;
   keccak512_4way_context keccak;
   luffa_2way_context luffa;
   cubehashParam cube;
   sph_shavite512_context shavite;
   simd_2way_context simd;
   hashState_echo echo;
   hamsi512_4way_context hamsi;
// sph_fugue512_context fugue;
} x12_4way_ctx_holder;

x12_4way_ctx_holder x12_4way_ctx __attribute__ ((aligned (64)));

void init_x12_4way_ctx()
{
   blake512_4way_init( &x12_4way_ctx.blake );
   bmw512_4way_init( &x12_4way_ctx.bmw );
   init_groestl( &x12_4way_ctx.groestl, 64 );
   skein512_4way_init( &x12_4way_ctx.skein );
   jh512_4way_init( &x12_4way_ctx.jh );
   keccak512_4way_init( &x12_4way_ctx.keccak );
   luffa_2way_init( &x12_4way_ctx.luffa, 512 );
   cubehashInit( &x12_4way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &x12_4way_ctx.shavite );
   simd_2way_init( &x12_4way_ctx.simd, 512 );
   init_echo( &x12_4way_ctx.echo, 512 );
   hamsi512_4way_init( &x12_4way_ctx.hamsi );
// sph_fugue512_init( &x12_4way_ctx.fugue );
};

void x12_4way_hash( void *state, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
   x12_4way_ctx_holder ctx;
   memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );

   // 1 Blake
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   // 2 Bmw
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   // Serial
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   // 3 Groestl
   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

   // Parallel 4way 64 bit
   mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   // 4 Skein
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   // 5 JH
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // 6 Keccak
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   // Serial
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   // 7 Luffa
   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
   mm256_interleave_2x128( vhash, hash2, hash3, 512 );
   luffa_2way_init( &ctx.luffa, 512 );
   luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
   mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );

   // 8 Cubehash
   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );

   // 9 Shavite
   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
           sizeof(sph_shavite512_context) );
   sph_shavite512( &ctx.shavite, hash1, 64 );
   sph_shavite512_close( &ctx.shavite, hash1 );
   memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
           sizeof(sph_shavite512_context) );
   sph_shavite512( &ctx.shavite, hash2, 64 );
   sph_shavite512_close( &ctx.shavite, hash2 );
   memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
           sizeof(sph_shavite512_context) );
   sph_shavite512( &ctx.shavite, hash3, 64 );
   sph_shavite512_close( &ctx.shavite, hash3 );

   // 10 Simd
   mm256_interleave_2x128( vhash, hash0, hash1, 512 );
   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
   mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
   mm256_interleave_2x128( vhash, hash2, hash3, 512 );
   simd_2way_init( &ctx.simd, 512 );
   simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
   mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );

   // 11 Echo
   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );
   memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence *)hash2,
                      (const BitSequence *) hash2, 512 );
   memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence *)hash3,
                      (const BitSequence *) hash3, 512 );

   // 12 Hamsi parallel 4way 32 bit
   mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );

   mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );

/*
   // 13 Fugue serial
   sph_fugue512( &ctx.fugue, hash0, 64 );
   sph_fugue512_close( &ctx.fugue, hash0 );
   memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
   sph_fugue512( &ctx.fugue, hash1, 64 );
   sph_fugue512_close( &ctx.fugue, hash1 );
   memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
   sph_fugue512( &ctx.fugue, hash2, 64 );
   sph_fugue512_close( &ctx.fugue, hash2 );
   memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
   sph_fugue512( &ctx.fugue, hash3, 64 );
   sph_fugue512_close( &ctx.fugue, hash3 );

   memcpy( state,    hash0, 32 );
   memcpy( state+32, hash1, 32 );
   memcpy( state+64, hash2, 32 );
   memcpy( state+96, hash3, 32 );
*/
}

int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // big endian encode 0..18 uint32_t, 64 bits at a time
   swab32_array( endiandata, pdata, 20 );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   for ( int m=0; m < 6; m++ )
      if ( Htarg <= htmax[m] )
      {
         uint32_t mask = masks[m];
         do
         {
            found[0] = found[1] = found[2] = found[3] = false;
            be32enc( noncep0, n   );
            be32enc( noncep1, n+1 );
            be32enc( noncep2, n+2 );
            be32enc( noncep3, n+3 );

            x12_4way_hash( hash, vdata );
            pdata[19] = n;

            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
            {
               found[0] = true;
               num_found++;
               nonces[0] = n;
               work_set_target_ratio( work, hash );
            }
            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
            {
               found[1] = true;
               num_found++;
               nonces[1] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
            {
               found[2] = true;
               num_found++;
               nonces[2] = n+2;
               work_set_target_ratio( work, hash+16 );
            }
            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
            {
               found[3] = true;
               num_found++;
               nonces[3] = n+3;
               work_set_target_ratio( work, hash+24 );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
      }

   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
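Note on the magic offsets in scanhash_x12_4way above: after mm256_interleave_4x64, 64-bit header word w of lane l lives at 64-bit index w*4 + l. The nonce is uint32 word 19 of the header, i.e. the upper half of 64-bit word 9, so lane l's nonce sits at uint32 index (9*4 + l)*2 + 1, matching the `// 9*8 + 1` comment. A standalone check of that arithmetic:

#include <stdio.h>

// Prints the uint32 index of each lane's nonce inside the 4x64
// interleaved header buffer: 73, 75, 77, 79.
int main()
{
   for ( int lane = 0; lane < 4; lane++ )
      printf( "lane %d: vdata[%d]\n", lane, ( 9*4 + lane )*2 + 1 );
   return 0;
}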
algo/x12/x12-gate.c (new file, 18 lines)
@@ -0,0 +1,18 @@
#include "x12-gate.h"

bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
  init_x12_4way_ctx();
  gate->scanhash = (void*)&scanhash_x12_4way;
  gate->hash     = (void*)&x12_4way_hash;
#else
  init_x12_ctx();
  gate->scanhash = (void*)&scanhash_x12;
  gate->hash     = (void*)&x12hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
};
algo/x12/x12-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#ifndef X12_GATE_H__
#define X12_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__) && defined(__AES__)
  #define X12_4WAY
#endif

bool register_x12_algo( algo_gate_t* gate );

#if defined(X12_4WAY)

void x12_4way_hash( void *state, const void *input );

int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );

void init_x12_4way_ctx();

#endif

void x12hash( void *state, const void *input );

int scanhash_x12( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done );

void init_x12_ctx();

#endif
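For context, a sketch of how a registered gate is typically consumed. This is illustrative only (the real dispatch lives elsewhere in cpuminer-opt), and it assumes algo_gate_t declares scanhash as a function pointer with the same signature as scanhash_x12 above; run_scanhash is a hypothetical helper:

#include <stdint.h>

// Hypothetical call site: once register_x12_algo has filled the gate,
// a miner thread drives the algorithm purely through the function
// pointers, never referencing x12-specific symbols directly.
static int run_scanhash( algo_gate_t *gate, int thr_id, struct work *work,
                         uint32_t max_nonce, uint64_t *hashes_done )
{
   return gate->scanhash( thr_id, work, max_nonce, hashes_done );
}

This indirection is why adding x12 only requires the three new files plus the Makefile.am and dispatch-table entries; no other miner code changes.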
algo/x12/x12.c (new file, 252 lines)
@@ -0,0 +1,252 @@
#include "x12-gate.h"

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "algo/groestl/sph_groestl.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/fugue/sph_fugue.h"

#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"

#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif

typedef struct {
#ifdef NO_AES_NI
   sph_groestl512_context groestl;
   sph_echo512_context echo;
#else
   hashState_groestl groestl;
   hashState_echo echo;
#endif
   hashState_luffa luffa;
   cubehashParam cubehash;
   sph_shavite512_context shavite;
   hashState_sd simd;
   sph_hamsi512_context hamsi;
// sph_fugue512_context fugue;
} x12_ctx_holder;

x12_ctx_holder x12_ctx;

void init_x12_ctx()
{
#ifdef NO_AES_NI
   sph_groestl512_init(&x12_ctx.groestl);
   sph_echo512_init(&x12_ctx.echo);
#else
   init_echo( &x12_ctx.echo, 512 );
   init_groestl (&x12_ctx.groestl, 64 );
#endif
   init_luffa( &x12_ctx.luffa, 512 );
   cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
   sph_shavite512_init( &x12_ctx.shavite );
   init_sd( &x12_ctx.simd, 512 );
   sph_hamsi512_init( &x12_ctx.hamsi );
// sph_fugue512_init( &x13_ctx.fugue );
};

void x12hash(void *output, const void *input)
{
   unsigned char hash[128] __attribute__ ((aligned (32)));
   #define hashB hash+64

   x12_ctx_holder ctx;
   memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );

   // X11 algos

   unsigned char hashbuf[128];
   size_t hashptr;
   sph_u64 hashctA;
   sph_u64 hashctB;

   //---blake1---

   DECL_BLK;
   BLK_I;
   BLK_W;
   BLK_C;

   //---bmw2---

   DECL_BMW;
   BMW_I;
   BMW_U;

   #define M(x)    sph_dec64le_aligned(data + 8 * (x))
   #define H(x)    (h[x])
   #define dH(x)   (dh[x])

   BMW_C;

   #undef M
   #undef H
   #undef dH

   //---groestl----

   #ifdef NO_AES_NI
     sph_groestl512 (&ctx.groestl, hash, 64);
     sph_groestl512_close(&ctx.groestl, hash);
   #else
     update_and_final_groestl( &ctx.groestl, (char*)hash,
                               (const char*)hash, 512 );
   #endif

   //---skein4---

   DECL_SKN;
   SKN_I;
   SKN_U;
   SKN_C;

   //---jh5------

   DECL_JH;
   JH_H;

   //---keccak6---

   DECL_KEC;
   KEC_I;
   KEC_U;
   KEC_C;

   //--- luffa7
   update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                           (const BitSequence*)hash, 64 );

   // 8 Cube
   cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
                         (const byte*)hashB, 64 );

   // 9 Shavite
   sph_shavite512( &ctx.shavite, hash, 64);
   sph_shavite512_close( &ctx.shavite, hashB);

   // 10 Simd
   update_final_sd( &ctx.simd, (BitSequence *)hash,
                    (const BitSequence *)hashB, 512 );

   //11---echo---

   #ifdef NO_AES_NI
     sph_echo512(&ctx.echo, hash, 64);
     sph_echo512_close(&ctx.echo, hashB);
   #else
     update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                         (const BitSequence *)hash, 512 );
   #endif

   // 12 Hamsi
   sph_hamsi512(&ctx.hamsi, hashB, 64);
   sph_hamsi512_close(&ctx.hamsi, hash);

/*
   // 13 Fugue
   sph_fugue512(&ctx.fugue, hash, 64);
   sph_fugue512_close(&ctx.fugue, hashB);
*/
   asm volatile ("emms");
   memcpy(output, hash, 32);   // hamsi closed into hash; matches the 4way path
}

int scanhash_x12(int thr_id, struct work *work, uint32_t max_nonce,
                 uint64_t *hashes_done)
{
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t hash64[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];

   uint64_t htmax[] = {
      0,
      0xF,
      0xFF,
      0xFFF,
      0xFFFF,
      0x10000000
   };
   uint32_t masks[] = {
      0xFFFFFFFF,
      0xFFFFFFF0,
      0xFFFFFF00,
      0xFFFFF000,
      0xFFFF0000,
      0
   };

   // we need bigendian data...
   swab32_array( endiandata, pdata, 20 );

#ifdef DEBUG_ALGO
   printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
   for (int m=0; m < 6; m++) {
      if (Htarg <= htmax[m]) {
         uint32_t mask = masks[m];
         do {
            pdata[19] = ++n;
            be32enc(&endiandata[19], n);
            x12hash(hash64, endiandata);
#ifndef DEBUG_ALGO
            if (!(hash64[7] & mask))
            {
               if ( fulltest(hash64, ptarget) )
               {
                  *hashes_done = n - first_nonce + 1;
                  return true;
               }
//               else
//               {
//                  applog(LOG_INFO, "Result does not validate on CPU!");
//               }
            }

#else
            if (!(n % 0x1000) && !thr_id) printf(".");
            if (!(hash64[7] & mask)) {
               printf("[%d]",thr_id);
               if (fulltest(hash64, ptarget)) {
                  work_set_target_ratio( work, hash64 );
                  *hashes_done = n - first_nonce + 1;
                  return true;
               }
            }
#endif
         } while (n < max_nonce && !work_restart[thr_id].restart);
         // see blake.c if else to understand the loop on htmax => mask
         break;
      }
   }

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}
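Note: the htmax/masks tables in both scanhash functions above implement an early-reject filter. When the target's high word Htarg is at most htmax[m], a winning hash must have all bits of masks[m] clear in hash word 7, so most nonces are discarded with a single AND before the full fulltest. A standalone sketch of the same check, using the tables exactly as they appear in scanhash_x12:

#include <stdint.h>
#include <stdbool.h>

// Returns true when hash word 7 already rules the nonce out for the
// given target high word, mirroring the loop in scanhash_x12 above.
static bool quick_reject( uint32_t hash7, uint32_t htarg )
{
   static const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF,
                                     0x10000000 };
   static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                     0xFFFFF000, 0xFFFF0000, 0 };
   for ( int m = 0; m < 6; m++ )
      if ( htarg <= htmax[m] )
         return ( hash7 & masks[m] ) != 0;   // high bits set: cannot win
   return false;
}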
Some files were not shown because too many files have changed in this diff.