Compare commits

...

4 Commits

Author SHA1 Message Date
Jay D Dee 502ed0b1fe v3.8.2.1 2018-02-17 13:52:24 -05:00
Jay D Dee d60a268972 v3.8.2 2018-02-15 14:48:50 -05:00
Jay D Dee e4265a6f11 v3.8.1.1 2018-02-09 23:30:14 -05:00
Jay D Dee a28daca3ce v3.8.1 2018-02-07 16:38:45 -05:00
128 changed files with 7940 additions and 5243 deletions

View File

@@ -70,6 +70,8 @@ cpuminer_SOURCES = \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/myrgr-gate.c \
algo/groestl/myrgr-4way.c \
algo/groestl/myr-groestl.c \
algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \
@@ -97,10 +99,10 @@ cpuminer_SOURCES = \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
@@ -114,6 +116,9 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-gate.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
@@ -127,9 +132,17 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \
algo/quark/anime.c \
algo/quark/anime-4way.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
algo/qubit/deep-gate.c \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
algo/ripemd/ripemd-hash-4way.c \
algo/ripemd/lbry-gate.c \
algo/ripemd/lbry.c \
algo/ripemd/lbry-4way.c \
algo/scrypt.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
@@ -143,8 +156,9 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
algo/simd/sse2/vector.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \
@@ -184,6 +198,9 @@ cpuminer_SOURCES = \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x12/x12-gate.c \
algo/x12/x12.c \
algo/x12/x12-4way.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \

View File

@@ -13,9 +13,34 @@ mailto://jayddee246@gmail.com
See file RELEASE_NOTES for change log and compile instructions.
Requirements
------------
1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS (OSX) is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
Supported Algorithms
--------------------
allium Garlicoin
anime Animecoin
argon2
axiom Shabal-256 MemoHash
bastion
@@ -74,40 +99,19 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)\n\
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
zr5 Ziftr
Requirements
------------
1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS (OSX) is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork.
Errata
------
@@ -136,10 +140,13 @@ output from the miner showing the startup and any errors.
Donations
---------
I do not do this for money but I have a donation address if users
are so inclined.
cpuminer-opt has no fees of any kind but donations are accepted.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining!

View File

@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

View File

@@ -98,8 +98,8 @@ Start mining.
Windows
The following is how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.
Building on Windows prerequisites:
@@ -131,7 +131,7 @@ or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
Run build.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -159,6 +159,34 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.2.1
Fixed low difficulty rejects with allium.
Fixed qubit AVX2.
Restored lyra2z lost hash.
Fixed build.sh
v3.8.2
Fixed and faster myr-gr.
Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
Faster lyra2rev2, lbry, skein.
Large reduction in compiler warnings.
v3.8.1.1
Fixed Windows AVX2 crash.
v3.8.1
Fixes x16r on CPUs with only SSE2.
More optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
v3.8.0.1
Fixed x16r AVX2 low hash rate.
v3.8.0
4way no longer a separate feature, included in AVX2.

View File

@@ -155,6 +155,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
case ALGO_ANIME: register_anime_algo ( gate ); break;
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
@@ -213,6 +214,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
@@ -298,6 +300,7 @@ const char* const algo_alias_map[][2] =
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2zoin", "lyra2z330" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },

View File

@@ -553,22 +553,22 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +615,22 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
M0 = mm_bswap_32( * buf ); \
M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_bswap_32( *(buf+10) ); \
MB = mm_bswap_32( *(buf+11) ); \
MC = mm_bswap_32( *(buf+12) ); \
MD = mm_bswap_32( *(buf+13) ); \
ME = mm_bswap_32( *(buf+14) ); \
MF = mm_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -727,22 +727,22 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_byteswap_32( * buf ); \
M1 = mm256_byteswap_32( *(buf+1) ); \
M2 = mm256_byteswap_32( *(buf+2) ); \
M3 = mm256_byteswap_32( *(buf+3) ); \
M4 = mm256_byteswap_32( *(buf+4) ); \
M5 = mm256_byteswap_32( *(buf+5) ); \
M6 = mm256_byteswap_32( *(buf+6) ); \
M7 = mm256_byteswap_32( *(buf+7) ); \
M8 = mm256_byteswap_32( *(buf+8) ); \
M9 = mm256_byteswap_32( *(buf+9) ); \
MA = mm256_byteswap_32( *(buf+10) ); \
MB = mm256_byteswap_32( *(buf+11) ); \
MC = mm256_byteswap_32( *(buf+12) ); \
MD = mm256_byteswap_32( *(buf+13) ); \
ME = mm256_byteswap_32( *(buf+14) ); \
MF = mm256_byteswap_32( *(buf+15) ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -848,22 +848,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +913,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
out[k] = mm_bswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_byteswap_32( sc->H[k] );
out[k] = mm256_bswap_32( sc->H[k] );
}
// Blake-512 4 way
@@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc,
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
*(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
*(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_64( sc->H[k] );
out[k] = mm256_bswap_64( sc->H[k] );
}
#endif
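The only change in this file is the rename of the byte-swap helpers (mm_byteswap_32 to mm_bswap_32 and so on). For reference, a byte swap across 32-bit lanes is commonly a single SSSE3 shuffle; this is a sketch of the idea, not necessarily the project's exact definition:

#include <tmmintrin.h>   // SSSE3

// Reverse the byte order within each 32-bit lane of a 128-bit vector.
static inline __m128i mm_bswap_32_sketch( __m128i x )
{
   return _mm_shuffle_epi8( x, _mm_set_epi8(
          12, 13, 14, 15,   8,  9, 10, 11,
           4,  5,  6,  7,   0,  1,  2,  3 ) );
}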

View File

@@ -96,7 +96,7 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
sleep(1);
// sleep(1);
}
return num_found;

View File

@@ -12,11 +12,11 @@ static __thread blake256_4way_context blake_mid;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));

View File

@@ -49,11 +49,6 @@ extern "C"{
// BMW256
// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
// while lanes 1 & 3 produce invalid hash. The cause is not known.
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
@@ -121,16 +116,14 @@ static const sph_u64 IV512[] = {
mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
// The multiplication in this macro is a possible cause of the lane
// corruption but a vectorized mullo did not help.
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \
), H[ ( (j)+7 ) & 0xF ] )
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
@@ -447,22 +440,22 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
__m128i qt[32], xl, xh; \
qt[ 0] = ss0( Ws0 ) + H[ 1];
qt[ 1] = ss1( Ws1 ) + H[ 2];
qt[ 2] = ss2( Ws2 ) + H[ 3];
qt[ 3] = ss3( Ws3 ) + H[ 4];
qt[ 4] = ss4( Ws4 ) + H[ 5];
qt[ 5] = ss0( Ws5 ) + H[ 6];
qt[ 6] = ss1( Ws6 ) + H[ 7];
qt[ 7] = ss2( Ws7 ) + H[ 8];
qt[ 8] = ss3( Ws8 ) + H[ 9];
qt[ 9] = ss4( Ws9 ) + H[10];
qt[10] = ss0( Ws10) + H[11];
qt[11] = ss1( Ws11) + H[12];
qt[12] = ss2( Ws12) + H[13];
qt[13] = ss3( Ws13) + H[14];
qt[14] = ss4( Ws14) + H[15];
qt[15] = ss0( Ws15) + H[ 0];
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
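The qt[] rewrite above (and the compress_big rewrite below) replaces the operator form a + b on __m128i/__m256i values, a GCC/Clang vector extension, with explicit add intrinsics. Both compute the same lanewise sums; the intrinsic form is portable and is likely part of the "large reduction in compiler warnings" noted in the change log. A minimal illustration:

#include <immintrin.h>

__m128i lanewise_sum_demo( __m128i a, __m128i b )
{
   // __m128i s = a + b;          // compiles only as a GCC/Clang
   //                             // vector extension
   return _mm_add_epi32( a, b );  // portable, same 32-bit lane sums
}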
@@ -738,24 +731,24 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh; \
__m256i qt[32], xl, xh;
qt[ 0] = sb0( Wb0 ) + H[ 1];
qt[ 1] = sb1( Wb1 ) + H[ 2];
qt[ 2] = sb2( Wb2 ) + H[ 3];
qt[ 3] = sb3( Wb3 ) + H[ 4];
qt[ 4] = sb4( Wb4 ) + H[ 5];
qt[ 5] = sb0( Wb5 ) + H[ 6];
qt[ 6] = sb1( Wb6 ) + H[ 7];
qt[ 7] = sb2( Wb7 ) + H[ 8];
qt[ 8] = sb3( Wb8 ) + H[ 9];
qt[ 9] = sb4( Wb9 ) + H[10];
qt[10] = sb0( Wb10) + H[11];
qt[11] = sb1( Wb11) + H[12];
qt[12] = sb2( Wb12) + H[13];
qt[13] = sb3( Wb13) + H[14];
qt[14] = sb4( Wb14) + H[15];
qt[15] = sb0( Wb15) + H[ 0];
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] );
qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] );
qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] );
qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] );
qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] );
qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] );
qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] );
qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] );
qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] );
qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] );
qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] );
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
@@ -868,7 +861,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
}
// BMW256
/*
static const uint32_t final_s[16][4] =
{
{ 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
@@ -888,7 +881,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
*/
/*
static const __m128i final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
@@ -908,7 +901,7 @@ static const __m128i final_s[16] =
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
@@ -984,7 +977,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = mm_zero;
buf[ (buf_size - 4) >> 2 ] = m128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )

1251
algo/bmw/bmw.test Normal file

File diff suppressed because it is too large.

View File

@@ -477,7 +477,7 @@ do { \
for (u = 0; u < 16; u ++) \
sph_enc64le_aligned(data + 8 * u, h2[u]); \
dh = h1; \
h = final_b; \
h = (sph_u64*)final_b; \
} \
/* end wrapped for break loop */ \
out = dst; \

View File

@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
#endif
} // transform
// Ccubehash context initializing is very expensive.
// Cubehash context initializing is very expensive.
// Cache the initial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
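A sketch of the cache-and-copy pattern the comment describes: initialize one context at startup, then restore per-hash state with a memcpy instead of rerunning the expensive init. The init call's signature is assumed from the common CubeHash API and may differ from the project's:

#include <string.h>

// One-time, at startup (signature assumed):
// cubehashInit( &cube_ctx_cache, hashbitlen, rounds, blockbytes );

// Per hash: reinitialize by copying the cached context.
static void cubehash_reinit_sketch( cubehashParam *sp )
{
   memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
}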

View File

@@ -1,2 +0,0 @@
amd64
x86

View File

@@ -14,18 +14,20 @@
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#if defined(__AES__)
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
#include "vperm.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
unsigned int r, b, i, j;
__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
// __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
int i, j;
ctx->k = _mm_xor_si128(ctx->k, ctx->k);
ctx->k = _mm_setzero_si128();
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
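The init change above swaps xor-with-self for _mm_setzero_si128(). Both yield an all-zero register, but setzero states the intent directly and does not read ctx->k before it has ever been written. A minimal illustration:

#include <emmintrin.h>

static __m128i zeroed_key( void )
{
   __m128i k;
   // k = _mm_xor_si128( k, k );  // zero, but reads k uninitialized
   k = _mm_setzero_si128();       // zero, with no read of k
   return k;
}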

View File

@@ -1 +0,0 @@
Çağdaş Çalık

View File

@@ -1,120 +0,0 @@
/*
* file : vperm.h
* version : 1.0.208
* date : 14.12.2010
*
* vperm implementation of AES s-box
*
* Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#ifndef VPERM_H
#define VPERM_H
#include "algo/sha/sha3_common.h"
#include <tmmintrin.h>
/*
extern const unsigned int _k_s0F[];
extern const unsigned int _k_ipt[];
extern const unsigned int _k_opt[];
extern const unsigned int _k_inv[];
extern const unsigned int _k_sb1[];
extern const unsigned int _k_sb2[];
extern const unsigned int _k_sb3[];
extern const unsigned int _k_sb4[];
extern const unsigned int _k_sb5[];
extern const unsigned int _k_sb7[];
extern const unsigned int _k_sbo[];
extern const unsigned int _k_h63[];
extern const unsigned int _k_hc6[];
extern const unsigned int _k_h5b[];
extern const unsigned int _k_h4e[];
extern const unsigned int _k_h0e[];
extern const unsigned int _k_h15[];
extern const unsigned int _k_aesmix1[];
extern const unsigned int _k_aesmix2[];
extern const unsigned int _k_aesmix3[];
extern const unsigned int _k_aesmix4[];
*/
// input: x, table
// output: x
#define TRANSFORM(x, table, t1, t2)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
x = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
x = _mm_xor_si128(x, t1)
#if 0
// compiled erroneously with 32-bit msc compiler
t2 = _mm_shuffle_epi8(table[0], x);\
x = _mm_shuffle_epi8(table[1], t1);\
x = _mm_xor_si128(x, t2)
#endif
// input: x
// output: t2, t3
#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
x = _mm_xor_si128(x, t1);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
t3 = _mm_xor_si128(t3, t2);\
t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
t2 = _mm_xor_si128(t2, x);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
t3 = _mm_xor_si128(t3, t1);\
// input: x1, x2, table
// output: y
#define VPERM_LOOKUP(x1, x2, table, y, t)\
t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
y = _mm_xor_si128(y, t)
// input: x
// output: x
#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4) \
TRANSFORM(x, _k_ipt, t1, t2);\
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
x = _mm_xor_si128(x, M128(_k_h63))
// input: x
// output: x
#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
s3 = _mm_xor_si128(s1, s2);\
x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
x = _mm_xor_si128(x, M128(_k_h5b))
// input: x
// output: x
#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
TRANSFORM(x, _k_ipt, t1, t2);\
AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
TRANSFORM(x, _k_opt, t1, t2)
#endif // VPERM_H

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "myrgr-gate.h"
#include <stdio.h>
#include <stdlib.h>
@@ -10,8 +10,6 @@
#else
#include "aes_ni/hash-groestl.h"
#endif
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
#ifndef USE_SPH_SHA
SHA256_CTX sha;
#else
sph_sha256_context sha;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
#else
init_groestl (&myrgr_ctx.groestl, 64 );
#endif
#ifndef USE_SPH_SHA
SHA256_Init( &myrgr_ctx.sha );
#else
sph_sha256_init( &myrgr_ctx.sha );
#endif
sph_sha256_init(&myrgr_ctx.sha);
}
void myriadhash( void *output, const void *input )
void myriad_hash(void *output, const void *input)
{
myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64)));
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)input,
(const char*)input, 640 );
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
#ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha, hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
#endif
memcpy(output, hash, 32);
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
memcpy(output, hash, 32);
}
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
int scanhash_myriad(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
do {
const uint32_t Htarg = ptarget[7];
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
myriadhash(hash, endiandata);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
/*
bool register_myriad_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;
// gate->hash_alt = (void*)&myriadhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

134
algo/groestl/myrgr-4way.c Normal file
View File

@@ -0,0 +1,134 @@
#include "myrgr-gate.h"
#if defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, vhash );
mm_deinterleave_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
/*
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
*/
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
myriad_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
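myriad_4way_hash above works on four lanes at once: the 4x32 deinterleave splits the vectorized input into four scalar buffers for Groestl, and the interleave regathers them for the 4-way SHA-256. A scalar sketch of the interleave layout (word i of every lane forms one 128-bit vector), illustrative rather than the project's optimized code:

#include <stdint.h>

// dst receives lane words round-robin: { s0[0], s1[0], s2[0], s3[0],
// s0[1], s1[1], ... }, so each group of four 32-bit words maps to one
// SSE lane set. bit_len is the data length in bits, as used above.
static void interleave_4x32_sketch( uint32_t *dst, const uint32_t *s0,
        const uint32_t *s1, const uint32_t *s2, const uint32_t *s3,
        int bit_len )
{
   for ( int i = 0; i < bit_len / 32; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}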

18
algo/groestl/myrgr-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "myrgr-gate.h"
bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
init_myrgr_4way_ctx();
gate->scanhash = (void*)&scanhash_myriad_4way;
gate->hash = (void*)&myriad_4way_hash;
#else
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

30
algo/groestl/myrgr-gate.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define MYRGR_4WAY
#endif
#if defined(MYRGR_4WAY)
void myriad_4way_hash( void *state, const void *input );
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_4way_ctx();
#endif
void myriad_hash( void *state, const void *input );
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_ctx();
#endif

File diff suppressed because it is too large.

View File

@@ -48,20 +48,20 @@ extern "C"{
#define SPH_SIZE_hamsi512 512
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
__m128i h[16];
__m128i partial[2];
__m256i h[8];
__m256i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init(void *cc);
void hamsi512_4way(void *cc, const void *data, size_t len);
void hamsi512_4way_close(void *cc, void *dst);
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#ifdef __cplusplus
}

View File

@@ -1,482 +0,0 @@
/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
/*
* Helper code for Hamsi (input block expansion). This code is
* automatically generated and includes precomputed tables for
* expansion code which handles 2 to 8 bits at a time.
*
* This file is included from hamsi.c, and is not meant to be compiled
* independently.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef __cplusplus
extern "C"{
#endif
/* Note: this table lists bits within each byte from least
significant to most significant. */
static const sph_u32 T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
};
#define U_BIG( n ) \
do { \
__m128i db = buf[n]; \
for ( int u = 0; u < 32; u++ ) \
{ \
__m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
db = _mm_srli_epi32( db, 1 ); \
} \
} while (0);
#define INPUT_BIG \
do { \
const sph_u32 *tp = &T512[0][0]; \
m0 = mm_zero; \
m1 = mm_zero; \
m2 = mm_zero; \
m3 = mm_zero; \
m4 = mm_zero; \
m5 = mm_zero; \
m6 = mm_zero; \
m7 = mm_zero; \
m8 = mm_zero; \
m9 = mm_zero; \
mA = mm_zero; \
mB = mm_zero; \
mC = mm_zero; \
mD = mm_zero; \
mE = mm_zero; \
mF = mm_zero; \
U_BIG( 0 ); \
U_BIG( 1 ); \
} while (0)
#ifdef __cplusplus
}
#endif
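For reference, a minimal scalar sketch of what U_BIG/INPUT_BIG compute for one lane, assuming the usual sph_hamsi table layout of static const sph_u32 T512[64][16] (the helper name input_big_ref is hypothetical): each of the 64 input bits selects one 16-word row of T512, which is XORed into the expanded message only when that bit is set.
static void input_big_ref( const sph_u32 buf[2], sph_u32 m[16] )
{
const sph_u32 *tp = &T512[0][0];   // 64 rows of 16 words
for ( int i = 0; i < 16; i++ ) m[i] = 0;
for ( int n = 0; n < 2; n++ )      // two 32-bit input words
{
sph_u32 db = buf[n];
for ( int u = 0; u < 32; u++ )
{
sph_u32 dm = (sph_u32)( 0 - (db & 1) );   // 0 or all-ones mask
for ( int i = 0; i < 16; i++ )
m[i] ^= dm & *tp++;
db >>= 1;
}
}
}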

View File

@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = mm_one_32;
sc->buf[ current>>2 ] = m128_one_32;
current += 4;
RSTATE;
if ( current > 116UL )

View File

@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI

View File

@@ -95,10 +95,11 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )

View File

@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
for ( int round = 0; round < 3; round++ )
{
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );

View File

@@ -339,13 +339,13 @@ do { \
jhSbuffer[53] = 0x00, \
jhSbuffer[54] = 0x00, \
jhSbuffer[55] = 0x00; \
jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
jhSbuffer[63] = (64*8) & 0xff; \
b = true; \
} \
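For this fixed 64-byte (512-bit) block the big-endian length field is 512 = 0x0000000000000200, so bytes 56-61 remain 0x00, byte 62 becomes 0x02 and byte 63 is 0x00. The new casts widen the constant to 64 bits before shifting, since shifting a 32-bit int by 32 or more places is undefined; the stored values are unchanged.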

View File

@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->w[ 1] = m256_neg1;
kc->w[ 2] = m256_neg1;
kc->w[ 8] = m256_neg1;
kc->w[12] = m256_neg1;
kc->w[17] = m256_neg1;
kc->w[20] = m256_neg1;
kc->ptr = 0;
kc->lim = 200 - (out_size >> 2);
}

View File

@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
KF_ELT( 5, 6, RC[j + 5]); \
KF_ELT( 6, 7, RC[j + 6]); \
KF_ELT( 7, 8, RC[j + 7]); \
*/
//kekDECL_STATE \
kekDECL_STATE \
*/
#define DECL_KEC

View File

@@ -0,0 +1,583 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "avxdefs.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\
#define MULT2(a0,a1) \
do { \
register __m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
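// MULT2 is Luffa's multiply-by-2 step in the message ring: per lane, the
// eight 32-bit words held in (a0,a1) are rotated down one word while the
// wrapped word is folded back in by XOR, computed here for both
// interleaved 128-bit lanes at once.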
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = _mm256_load_si256(&t);\
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);\
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
__m256i CNS[32];
/***************************************************/
/* Round function */
/* state: hash context */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1);
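// Luffa v2 tweaks: the second input half of permutation Qi, held in
// chainv[2i+1], is rotated left by i bits for i = 1..4; Q0 is untweaked.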
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) );
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
_mm256_srli_epi32( chainv[5], 30 ) );
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
_mm256_srli_epi32( chainv[7], 29 ) );
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
_mm256_srli_epi32( chainv[9], 28 ) );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS[10], &tmp[0] );
STEP_PART( &x[0], &CNS[12], &tmp[0] );
STEP_PART( &x[0], &CNS[14], &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i zero[2];
zero[0] = zero[1] = _mm256_setzero_si256();
/*---- blank round with m=0 ----*/
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
int i;
state->hashbitlen = hashbitlen;
for ( i=0; i<32; i++ ) CNS[i] =
_mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );
for ( i=0; i<10; i++ ) state->chainv[i] =
_mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ],
IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
return 0;
}
// Do not call luffa_2way_update_close after having called luffa_2way_update.
// Once luffa_2way_update has been called, only call luffa_2way_update or
// luffa_2way_close.
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
// store in buffer for transform in final for midstate to work
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
}
return 0;
}
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
// transform pad block
if ( state->rembytes )
// not empty, data is in buffer
rnd512_2way( state, buffer );
else
{ // empty pad block, constant data
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)hashval );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( hashval+32 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{
// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte len
const __m256i *vdata = (__m256i*)data;
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = mm256_bswap_32( vdata[0] );
msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+32 ) );
return 0;
}
#endif
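A minimal usage sketch of the 2-way API above, assuming vdata points to an 80-byte-per-lane, 2-way interleaved message (the buffer names are hypothetical); luffa_2way_update_close hashes the two full 32-byte blocks, then pads and transforms the 16-byte tail before finalization:
luffa_2way_context ctx;
uint32 vhash[32] __attribute((aligned(32)));   // two interleaved 64-byte digests
luffa_2way_init( &ctx, 512 );
luffa_2way_update_close( &ctx, vhash, vdata, 80 );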

View File

@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512
/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
* of a message block*/
/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5
/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
typedef struct {
uint32 buffer[8*2] __attribute((aligned(64)));
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context;
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len );
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_32( cast_m128i( data ) ) );
mm_bswap_32( cast_m128i( data ) ) );
}
else
{
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
}
#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
}
#endif

165 algo/lyra2/allium-4way.c Normal file
View File

@@ -0,0 +1,165 @@
#include "allium-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
hashState_groestl256 groestl;
} allium_4way_ctx_holder;
static __thread allium_4way_ctx_holder allium_4way_ctx;
bool init_allium_4way_ctx()
{
keccak256_4way_init( &allium_4way_ctx.keccak );
cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_4way_ctx.skein );
init_groestl256( &allium_4way_ctx.groestl, 32 );
return true;
}
void allium_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
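// LYRA2RE( K, kLen, pwd, pwdlen, salt, saltlen, timeCost, nRows, nCols ):
// output, password and salt all alias the same 32-byte hash below, with
// timeCost = 1 on an 8x8 matrix.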
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
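// With 4x32 interleaving, header word w of lane l lives at vdata[4*w + l],
// so the four lane copies of the nonce (word 19) occupy vdata[76..79].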
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
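// Blake-256 midstate: the first 64 of the 80 header bytes are hashed once
// per scan; allium_4way_hash resumes from this context with the last 16
// bytes at interleaved offset 64*4, i.e. input + (64<<2).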
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
allium_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

22 algo/lyra2/allium-gate.c Normal file
View File

@@ -0,0 +1,22 @@
#include "allium-gate.h"
int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xFFFFLL;
return true;
};

29 algo/lyra2/allium-gate.h Normal file
View File

@@ -0,0 +1,29 @@
#ifndef ALLIUM_GATE_H__
#define ALLIUM_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_4way_ctx();
#endif
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_ctx();
#endif
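Note that ALLIUM_4WAY requires both AVX2 and AES: the 4-way path vectorizes Blake, Keccak and Skein with AVX2 while its Groestl-256 stage uses the AES-NI implementation, so if either feature is missing the gate falls back to the scalar path.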

112 algo/lyra2/allium.c Normal file
View File

@@ -0,0 +1,112 @@
#include "allium-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "lyra2.h"
typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
cubehashParam cube;
sph_skein256_context skein;
#if defined (__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} allium_ctx_holder;
static __thread allium_ctx_holder allium_ctx;
bool init_allium_ctx()
{
sph_keccak256_init( &allium_ctx.keccak );
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
sph_skein256_init( &allium_ctx.skein );
#if defined (__AES__)
init_groestl256( &allium_ctx.groestl, 32 );
#else
sph_groestl256_init( &allium_ctx.groestl );
#endif
return true;
}
void allium_hash(void *state, const void *input)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
allium_ctx_holder ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
sph_blake256( &ctx.blake, input + 64, 16 );
sph_blake256_close( &ctx.blake, hash );
sph_keccak256( &ctx.keccak, hash, 32 );
sph_keccak256_close( &ctx.keccak, hash );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
sph_skein256( &ctx.skein, hash, 32 );
sph_skein256_close( &ctx.skein, hash );
#if defined (__AES__)
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
#else
sph_groestl256( &ctx.groestl, hash, 32 );
sph_groestl256_close( &ctx.groestl, hash );
#endif
memcpy(state, hash, 32);
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if ( opt_benchmark )
ptarget[7] = 0x3ffff;
for ( int i = 0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
sph_blake256_init( &allium_ctx.blake );
sph_blake256( &allium_ctx.blake, endiandata, 64 );
do {
be32enc( &endiandata[19], nonce );
allium_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -47,8 +47,9 @@
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -73,6 +74,8 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -230,6 +234,8 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -377,15 +383,15 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
/*
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset(wholeMatrix, 0, i);
memset( wholeMatrix, 0, i );
#endif
*/
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//
@@ -406,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
// - (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));

View File

@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */

View File

@@ -7,9 +7,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
@@ -18,19 +15,17 @@ typedef struct {
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
// sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
bool init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
// sph_bmw256_init( &l2v2_4way_ctx.bmw );
return true;
}
void lyra2rev2_4way_hash( void *state, const void *input )
@@ -45,7 +40,6 @@ void lyra2rev2_4way_hash( void *state, const void *input )
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
@@ -54,11 +48,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -71,36 +65,20 @@ void lyra2rev2_4way_hash( void *state, const void *input )
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// BMW256 4way has a lane corruption problem, only lanes 0 & 2 produce
// good hash. As a result this ugly workaround of running bmw256-4way
// twice with data shuffled to get all 4 lanes of good hash.
// The hash is then shuffled back into the appropriate lanes for output.
// Not as fast but still faster than using sph serially.
// shift lane 1 data to lane 2.
mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
uint32_t trash[8] __attribute__ ((aligned (32)));
// extract lane 0 as usual and lane2 containing lane 1 hash
mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
// shift lane2 data to lane 0 and lane 3 data to lane 2
mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
bmw256_4way_init( &ctx.bmw );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
// extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -144,7 +122,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
//printf("found0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
@@ -152,7 +129,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
//printf("found1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
@@ -160,7 +136,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
//printf("found2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
@@ -168,7 +143,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
//printf("found3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;

View File

@@ -14,18 +14,20 @@ bool lyra2rev2_thread_init()
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();;
#else
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif

View File

@@ -20,7 +20,7 @@ void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
bool init_lyra2rev2_4way_ctx();
#endif
@@ -29,7 +29,7 @@ void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
bool init_lyra2rev2_ctx();
#endif

View File

@@ -21,7 +21,7 @@ typedef struct {
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;
void init_lyra2rev2_ctx()
bool init_lyra2rev2_ctx()
{
cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 );
cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 );
@@ -29,6 +29,7 @@ void init_lyra2rev2_ctx()
sph_keccak256_init( &lyra2v2_ctx.keccak );
sph_skein256_init( &lyra2v2_ctx.skein );
sph_bmw256_init( &lyra2v2_ctx.bmw );
return true;
}
void l2v2_blake256_midstate( const void* input )

View File

@@ -42,7 +42,7 @@ inline void initState( uint64_t State[/*16*/] )
{
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i *state = (__m256i*)State;
state[0] = _mm256_setzero_si256();
state[1] = _mm256_setzero_si256();
@@ -53,7 +53,7 @@ inline void initState( uint64_t State[/*16*/] )
#elif defined (__AVX__)
__m128i* state = (__m128i*)State;
__m128i *state = (__m128i*)State;
state[0] = _mm_setzero_si128();
state[1] = _mm_setzero_si128();
@@ -123,8 +123,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
const int len_m256i = len / 32;
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
__m256i* state = (__m256i*)State;
__m256i* out = (__m256i*)Out;
__m256i *state = (__m256i*)State;
__m256i *out = (__m256i*)Out;
int i;
//Squeezes full blocks
@@ -141,8 +141,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
__m128i* state = (__m128i*)State;
__m128i* out = (__m128i*)Out;
__m128i *state = (__m128i*)State;
__m128i *out = (__m128i*)Out;
int i;
//Squeezes full blocks
@@ -186,19 +186,27 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
{
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i* in = (__m256i*)In;
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)In;
state[0] = _mm256_xor_si256( state[0], in[0] );
state[1] = _mm256_xor_si256( state[1], in[1] );
state[2] = _mm256_xor_si256( state[2], in[2] );
state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
state2 = _mm256_xor_si256( state2, in[2] );
LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
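// Holding the four state rows in registers across the XOR, the twelve
// rounds and the final store avoids round-tripping the sponge state
// through memory on every absorbed block.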
#elif defined (__AVX__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)In;
state[0] = _mm_xor_si128( state[0], in[0] );
state[1] = _mm_xor_si128( state[1], in[1] );
@@ -245,18 +253,26 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i* in = (__m256i*)In;
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)In;
state[0] = _mm256_xor_si256( state[0], in[0] );
state[1] = _mm256_xor_si256( state[1], in[1] );
state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
#elif defined (__AVX__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)In;
state[0] = _mm_xor_si128( state[0], in[0] );
state[1] = _mm_xor_si128( state[1], in[1] );
@@ -292,7 +308,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
uint64_t nCols )
{
int i;
@@ -301,24 +317,19 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
for ( i = 0; i < nCols; i++ )
{
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
__builtin_prefetch( out -i-6, 1, 0 );
out[0] = state0;
out[1] = state1;
@@ -330,15 +341,14 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
LYRA_ROUND_AVX2( state0, state1, state2, state3 );
}
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
#elif defined (__AVX__)
__m128i* state = (__m128i*)State;
__m128i *state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
__m128i state1 = _mm_load_si128( &state[1] );
__m128i state2 = _mm_load_si128( &state[2] );
@@ -348,7 +358,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
__m128i state6 = _mm_load_si128( &state[6] );
__m128i state7 = _mm_load_si128( &state[7] );
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
for ( i = 0; i < 6; i += 3)
{
@@ -387,7 +397,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
#else
uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
uint64_t *ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
for ( i = 0; i < nCols; i++ )
{
@@ -422,37 +432,31 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols )
{
int i;
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)rowIn;
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i* in = (__m256i*)rowIn;
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}
__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
for ( i = 0; i < nCols; i++ )
{
_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( out -i-6, 1, 0 );
state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
@@ -470,14 +474,14 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
out -= BLOCK_LEN_M256I;
}
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
#elif defined (__AVX__)
__m128i* state = (__m128i*)State;
__m128i *state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
__m128i state1 = _mm_load_si128( &state[1] );
__m128i state2 = _mm_load_si128( &state[2] );
@@ -487,8 +491,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
__m128i state6 = _mm_load_si128( &state[6] );
__m128i state7 = _mm_load_si128( &state[7] );
__m128i* in = (__m128i*)rowIn;
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
const __m128i *in = (const __m128i*)rowIn;
__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
for ( i = 0; i < 6; i += 3)
{
@@ -540,8 +544,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
#else
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -600,7 +604,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
{
@@ -608,35 +612,30 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)rowIn;
__m256i *inout = (__m256i*)rowInOut;
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i t0, t1, t2;
__m256i* in = (__m256i*)rowIn;
__m256i* inout = (__m256i*)rowInOut;
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i t0, t1, t2;
for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( inout + i, _MM_HINT_T0 );
_mm_prefetch( inout + i + 2, _MM_HINT_T0 );
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}
__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( inout, 1, 0 );
__builtin_prefetch( inout +2, 1, 0 );
__builtin_prefetch( inout +4, 1, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
for ( i = 0; i < nCols; i++ )
{
_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( inout +i+6, 1, 0 );
__builtin_prefetch( out -i-6, 1, 0 );
state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );
@@ -670,16 +669,16 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out -= BLOCK_LEN_M256I;
}
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
#elif defined (__AVX__)
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
const __m128i *in = (const __m128i*)rowIn;
__m128i *inout = (__m128i*)rowInOut;
__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
for ( i = 0; i < 6; i += 3)
{
@@ -691,12 +690,12 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}
__m128i* state = (__m128i*)State;
__m128i *state = (__m128i*)State;
// For the last round in this function not optimized for AVX
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
const uint64_t *ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -757,9 +756,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
#else
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -834,7 +833,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
*
*/
inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
{
@@ -842,35 +841,30 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
#if defined __AVX2__
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );
register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i* in = (const __m256i*)rowIn;
__m256i *inout = (__m256i*)rowInOut;
__m256i *out = (__m256i*)rowOut;
__m256i t0, t1, t2;
__m256i* in = (__m256i*)rowIn;
__m256i* inout = (__m256i*)rowInOut;
__m256i* out = (__m256i*)rowOut;
__m256i t0, t1, t2;
for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( out + i, _MM_HINT_T0 );
_mm_prefetch( out + i + 2, _MM_HINT_T0 );
_mm_prefetch( inout + i, _MM_HINT_T0 );
_mm_prefetch( inout + i + 2, _MM_HINT_T0 );
}
__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( inout, 1, 0 );
__builtin_prefetch( inout +2, 1, 0 );
__builtin_prefetch( inout +4, 1, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out +2, 1, 0 );
__builtin_prefetch( out +4, 1, 0 );
for ( i = 0; i < nCols; i++ )
{
_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( out + 9, _MM_HINT_T0 );
_mm_prefetch( out + 11, _MM_HINT_T0 );
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( inout +i+6, 1, 0 );
__builtin_prefetch( out +i+6, 1, 0 );
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
@@ -906,17 +900,17 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
inout += BLOCK_LEN_M256I;
}
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );
_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
#elif defined __AVX__
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
__m128i* out = (__m128i*)rowOut;
__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)rowIn;
__m128i *inout = (__m128i*)rowInOut;
__m128i *out = (__m128i*)rowOut;
for ( i = 0; i < 6; i += 3)
{
@@ -929,9 +923,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
}
// for the last round in this function that isn't optimized for AVX
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++)
{
@@ -997,9 +991,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
#else
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++)
{

View File

@@ -159,23 +159,26 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
//---- Housekeeping
void initState(uint64_t state[/*16*/]);
void initState( uint64_t state[/*16*/] );
//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
void absorbBlock( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );
//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);
//void printArray(unsigned char *array, unsigned int size, char *name);
////////////////////////////////////////////////////////////////////////////////////////////////
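Note on the sponge.c hunks above: the Intel _mm_prefetch intrinsic is replaced throughout by GCC's __builtin_prefetch, which also encodes write intent. A minimal sketch of the idiom, with illustrative names that are not from the source:
/* __builtin_prefetch( addr, rw, locality ) is a GCC/Clang builtin:
 * rw = 0 for read, 1 for write; locality = 0 (no reuse expected) up to
 * 3 (keep in all cache levels). The hunks use (ptr, 1, 0): prefetch
 * for write, evict quickly. */
void fill_backwards( long *dst, long n )
{
   long *p = dst + n - 1;
   __builtin_prefetch( p,     1, 0 );          // warm the first stores
   __builtin_prefetch( p - 8, 1, 0 );
   for ( long i = 0; i < n; i++ )
   {
      __builtin_prefetch( p - i - 16, 1, 0 );  // stay ahead of the stores
      p[-i] = i;
   }
}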

View File

@@ -85,12 +85,12 @@ typedef unsigned int uint;
U32TO8_BE((p) + 4, (uint32_t)((v) ));
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16)));
/* SHA-256 */
static const uint32_t sha256_constants[64] = {
static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
@@ -123,10 +123,10 @@ static const uint32_t sha256_constants[64] = {
typedef struct sha256_hash_state_t {
uint32_t H[8];
uint32_t H[8] __attribute__ ((aligned (16)));
uint64_t T;
uint32_t leftover;
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16)));
} sha256_hash_state;
@@ -242,7 +242,7 @@ typedef struct sha256_hmac_state_t {
} sha256_hmac_state;
static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) {
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0};
size_t i;
neoscrypt_hash_init_sha256(&st->inner);
@@ -570,17 +570,17 @@ typedef struct blake2s_param_t {
/* State block of 180 bytes */
typedef struct blake2s_state_t {
uint h[8];
uint h[8] __attribute__ ((aligned (16)));
uint t[2];
uint f[2];
uchar buf[2 * BLAKE2S_BLOCK_SIZE];
uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16)));
uint buflen;
} blake2s_state;
static void blake2s_compress(blake2s_state *S, const void *buf) {
uint i;
uint m[16];
uint v[16];
uint m[16] __attribute__ ((aligned (16)));
uint v[16] __attribute__ ((aligned (16)));
neoscrypt_copy(m, buf, 64);
neoscrypt_copy(v, S, 32);
@@ -1082,6 +1082,7 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
bool register_neoscrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_neoscrypt;
gate->hash = (void*)&neoscrypt;
gate->get_max64 = (void*)&get_neoscrypt_max64;
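Note: the alignment attributes added throughout neoscrypt.c let these buffers be used safely with aligned SSE2 loads and stores. A self-contained sketch of why that matters, with illustrative names:
#include <immintrin.h>
#include <stdint.h>
/* _mm_load_si128 requires a 16-byte aligned address and may fault on an
 * unaligned one; _mm_loadu_si128 tolerates any address at some cost on
 * older CPUs. The aligned attribute guarantees the fast path is legal. */
static uint32_t m[16] __attribute__ ((aligned (16)));
__m128i first_vector( void )
{
   return _mm_load_si128( (const __m128i*)m );
}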

View File

@@ -21,6 +21,7 @@ void nist5hash( void *state, const void *input );
int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_nist5_ctx();
#endif
#endif

View File

@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );

View File

@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
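Note: the anime/quark 4-way kernels conceptually run both candidate branches and use vh_mask to pick a result per 64-bit lane. A sketch of that select in isolation, using plain AVX2 intrinsics rather than the repo's helpers:
#include <immintrin.h>
static inline __m256i select_by_bit3( __m256i h, __m256i if_set,
                                      __m256i if_clear )
{
   const __m256i bit3 = _mm256_set1_epi64x( 8 );
   // all-ones in lanes where bit 3 of h is clear, zero where it is set
   __m256i mask = _mm256_cmpeq_epi64( _mm256_and_si256( h, bit3 ),
                                      _mm256_setzero_si256() );
   // blendv picks if_clear where mask bytes are set, if_set elsewhere
   return _mm256_blendv_epi8( if_set, if_clear, mask );
}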

130 algo/qubit/deep-2way.c Normal file
View File

@@ -0,0 +1,130 @@
#include "deep-gate.h"
#if defined(DEEP_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_echo echo;
} deep_2way_ctx_holder;
deep_2way_ctx_holder deep_2way_ctx;
void init_deep_2way_ctx()
{
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
cubehashInit(&deep_2way_ctx.cube,512,16,32);
sph_shavite512_init(&deep_2way_ctx.shavite);
init_echo(&deep_2way_ctx.echo, 512);
};
void deep_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
deep_2way_ctx_holder ctx;
memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
deep_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
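Note: mm256_interleave_2x128 packs two independent messages so each 256-bit vector carries 128 bits of lane 0 beside 128 bits of lane 1, the layout luffa_2way consumes. A plain-C sketch of that layout, simplified and not the repo's implementation:
#include <stdint.h>
#include <string.h>
void interleave_2x128_sketch( void *dst, const void *src0, const void *src1,
                              int bitlen )
{
   uint8_t *d = (uint8_t*)dst;
   const uint8_t *s0 = (const uint8_t*)src0;
   const uint8_t *s1 = (const uint8_t*)src1;
   for ( int i = 0; i < bitlen / 8; i += 16 )
   {
      memcpy( d,      s0 + i, 16 );   // 128 bits of lane 0
      memcpy( d + 16, s1 + i, 16 );   // 128 bits of lane 1
      d += 32;
   }
}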

17 algo/qubit/deep-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "deep-gate.h"
bool register_deep_algo( algo_gate_t* gate )
{
#if defined (DEEP_2WAY)
init_deep_2way_ctx();
gate->scanhash = (void*)&scanhash_deep_2way;
gate->hash = (void*)&deep_2way_hash;
#else
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
return true;
};
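Note: this is the algo-gate pattern used across the tree: one registration function binds either the vectorized or the scalar entry points at compile time, and the rest of the miner calls only through the gate. A minimal compile-clean sketch with made-up names:
#include <stdbool.h>
typedef struct
{
   void (*hash)( void *out, const void *in );
} demo_gate_t;
static void demo_hash_scalar( void *out, const void *in ) { (void)out; (void)in; }
#if defined(DEMO_2WAY)
static void demo_hash_2way( void *out, const void *in )   { (void)out; (void)in; }
#endif
bool register_demo( demo_gate_t *gate )
{
#if defined(DEMO_2WAY)
   gate->hash = demo_hash_2way;     // chosen when AVX2 + AES-NI are built in
#else
   gate->hash = demo_hash_scalar;   // generic fallback
#endif
   return true;
}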

32 algo/qubit/deep-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef DEEP_GATE_H__
#define DEEP_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define DEEP_2WAY
#endif
bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_2way_ctx();
#endif
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_ctx();
#endif

View File

@@ -1,9 +1,9 @@
#include "algo-gate-api.h"
#include "deep-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_deep_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
return true;
};

137 algo/qubit/qubit-2way.c Normal file
View File

@@ -0,0 +1,137 @@
#include "qubit-gate.h"
#if defined(QUBIT_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
} qubit_2way_ctx_holder;
qubit_2way_ctx_holder qubit_2way_ctx;
void init_qubit_2way_ctx()
{
cubehashInit(&qubit_2way_ctx.cube,512,16,32);
sph_shavite512_init(&qubit_2way_ctx.shavite);
simd_2way_init( &qubit_2way_ctx.simd, 512 );
init_echo(&qubit_2way_ctx.echo, 512);
};
void qubit_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
qubit_2way_ctx_holder ctx;
memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
qubit_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
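Note: the htmax/masks pair in these scanhash loops is a cheap pre-filter. hash[7] is the most significant 32-bit word of the little-endian hash, so once Htarg selects a mask, any hash with bits set under that mask cannot beat the target and fulltest() is skipped. The same test in isolation (illustrative):
#include <stdint.h>
static inline int quick_reject( uint32_t hash7, uint32_t mask )
{
   return ( hash7 & mask ) != 0;   // nonzero -> cannot meet the target
}
/* e.g. Htarg <= 0xF selects mask 0xFFFFFFF0, so only hashes with
 * hash[7] <= 0xF survive to the full 256-bit comparison. */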

17 algo/qubit/qubit-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "qubit-gate.h"
bool register_qubit_algo( algo_gate_t* gate )
{
#if defined (QUBIT_2WAY)
init_qubit_2way_ctx();
gate->scanhash = (void*)&scanhash_qubit_2way;
gate->hash = (void*)&qubit_2way_hash;
#else
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
return true;
};

32 algo/qubit/qubit-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUBIT_GATE_H__
#define QUBIT_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY
#endif
bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_2way_ctx();
#endif
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_ctx();
#endif

View File

@@ -1,11 +1,11 @@
#include "algo-gate-api.h"
#include "qubit-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
update_luffa( &qubit_luffa_mid, input, 64 );
}
void qubithash(void *output, const void *input)
void qubit_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute((aligned(64)));
#define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
{
pdata[19] = ++n;
be32enc(&endiandata[19], n);
qubithash(hash64, endiandata);
qubit_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
return 0;
}
bool register_qubit_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubithash;
return true;
};

138 algo/ripemd/lbry-4way.c Normal file
View File

@@ -0,0 +1,138 @@
#include "lbry-gate.h"
#if defined(LBRY_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha2-hash-4way.h"
#include "ripemd-hash-4way.h"
static __thread sha256_4way_context sha256_mid;
void lbry_4way_hash( void* output, const void* input )
{
sha256_4way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_4way_context ctx_ripemd;
uint32_t _ALIGN(64) vhashA[16<<2];
uint32_t _ALIGN(64) vhashB[16<<2];
uint32_t _ALIGN(64) vhashC[16<<2];
memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
sha256_4way( &ctx_sha256, input+(64<<2), 48 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
// sha512 64 bit data, 64 byte output
mm256_reinterleave_4x64( vhashB, vhashA, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
mm256_reinterleave_4x32( vhashA, vhashB, 512 );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA, 32 );
ripemd160_4way_close( &ctx_ripemd, vhashB );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
ripemd160_4way_close( &ctx_ripemd, vhashC );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashB, 20 );
sha256_4way( &ctx_sha256, vhashC, 20 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhashA, 256 );
}
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 108; // 27*4
uint32_t *noncep1 = vdata + 109;
uint32_t *noncep2 = vdata + 110;
uint32_t *noncep3 = vdata + 111;
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
sha256_4way_init( &sha256_mid );
sha256_4way( &sha256_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )  // 6 entries; sizeof(masks) would count bytes
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
lbry_4way_hash( hash, vdata );
if ( !( hash[7] & mask ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[27] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( !( (hash+16)[7] & mask ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( !( (hash+24)[7] & mask ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n+=4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
}
#endif
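Note: the 4-way lanes above mirror this scalar chain: sha256d over the 112-byte header, sha512 of that digest, ripemd160 over each 32-byte half, then sha256d over the two concatenated 20-byte digests. A sketch using OpenSSL one-shot helpers (the real miner keeps four interleaved lanes):
#include <openssl/sha.h>
#include <openssl/ripemd.h>
void lbry_hash_sketch( unsigned char out[32], const unsigned char hdr[112] )
{
   unsigned char a[64], b[40];
   SHA256( hdr, 112, a );     SHA256( a, 32, a );     // sha256d
   SHA512( a, 32, a );                                // 64-byte digest
   RIPEMD160( a,      32, b      );                   // first half
   RIPEMD160( a + 32, 32, b + 20 );                   // second half
   SHA256( b, 40, out );      SHA256( out, 32, out ); // final sha256d
}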

94 algo/ripemd/lbry-gate.c Normal file
View File

@@ -0,0 +1,94 @@
#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
size_t t;
int i;
algo_gate.gen_merkle_root( merkle_root, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
for ( int i = 0; i < 8; i++ )
g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
g_work->data[28] = 0x80000000;
}
void lbry_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
#if defined (LBRY_4WAY)
gate->scanhash = (void*)&scanhash_lbry_4way;
gate->hash = (void*)&lbry_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->get_max64 = (void*)&lbry_get_max64;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->set_target = (void*)&lbry_set_target;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->work_data_size = LBRY_WORK_DATA_SIZE;
return true;
}
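Note: plugging the sample from the comment into lbry_calc_network_diff: nbits 0x1c05ea29 gives bits = 0x05ea29 = 387625 and shift = 0x1c = 28, so d = (65535 / 387625) * 256 ≈ 43.281, matching the quoted diff. A standalone check (illustrative; it starts from the compact value rather than the byte-swapped work->data word):
#include <stdio.h>
int main( void )
{
   unsigned bits  = 0x1c05ea29 & 0xffffff;   // 0x05ea29 = 387625
   int      shift = 0x1c05ea29 >> 24;        // 0x1c = 28
   double d = 65535.0 / bits;
   for ( int m = shift; m < 29; m++ ) d *= 256.0;
   printf( "%.3f\n", d );                    // prints 43.281
   return 0;
}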

30 algo/ripemd/lbry-gate.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef LBRY_GATE_H__
#define LBRY_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define LBRY_4WAY
#endif
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
#define LBRY_NONCE_INDEX 27
#define LBRY_WORK_DATA_SIZE 192
#define LBRY_WORK_CMP_SIZE 76 // same as default
bool register_lbry_algo( algo_gate_t* gate );
#if defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void lbry_hash( void *state, const void *input );
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,19 +1,12 @@
#include "algo-gate-api.h"
#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "ripemd/sph_ripemd.h"
#include "sha/sph_sha2.h"
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
#define LBRY_NONCE_INDEX 27
#define LBRY_WORK_DATA_SIZE 192
#define LBRY_WORK_CMP_SIZE 76 // same as default
void lbry_hash(void* output, const void* input)
{
#ifndef USE_SPH_SHA
@@ -151,88 +144,3 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
pdata[27] = n;
return 0;
}
double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
size_t t;
int i;
algo_gate.gen_merkle_root( merkle_root, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
for ( int i = 0; i < 8; i++ )
g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
g_work->data[28] = 0x80000000;
}
void lbry_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->get_max64 = (void*)&lbry_get_max64;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->set_target = (void*)&lbry_set_target;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->work_data_size = LBRY_WORK_DATA_SIZE;
return true;
}

View File

@@ -0,0 +1,323 @@
#include "ripemd-hash-4way.h"
#if defined(__AVX__)
#include <stddef.h>
#include <string.h>
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
*/
#define F1(x, y, z) \
_mm_xor_si128( _mm_xor_si128( x, y ), z )
#define F2(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
static const uint32_t IV[5] =
{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
/*
* Round constants for RIPEMD-160.
*/
#define K11 0x00000000
#define K12 0x5A827999
#define K13 0x6ED9EBA1
#define K14 0x8F1BBCDC
#define K15 0xA953FD4E
#define K21 0x50A28BE6
#define K22 0x5C4DD124
#define K23 0x6D703EF3
#define K24 0x7A6D76E9
#define K25 0x00000000
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm_rotl_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi32( k ) ), s ), e ); \
c = mm_rotl_32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \
RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2(a, b, c, d, e, f, s, r, k) \
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_4way_round( ripemd160_4way_context *sc )
{
const __m128i *in = (__m128i*)sc->buf;
__m128i *h = (__m128i*)sc->val;
register __m128i A1, B1, C1, D1, E1;
register __m128i A2, B2, C2, D2, E2;
__m128i tmp;
A1 = A2 = h[0];
B1 = B2 = h[1];
C1 = C2 = h[2];
D1 = D2 = h[3];
E1 = E2 = h[4];
ROUND1( A, B, C, D, E, F1, 11, in[ 0], 1 );
ROUND1( E, A, B, C, D, F1, 14, in[ 1], 1 );
ROUND1( D, E, A, B, C, F1, 15, in[ 2], 1 );
ROUND1( C, D, E, A, B, F1, 12, in[ 3], 1 );
ROUND1( B, C, D, E, A, F1, 5, in[ 4], 1 );
ROUND1( A, B, C, D, E, F1, 8, in[ 5], 1 );
ROUND1( E, A, B, C, D, F1, 7, in[ 6], 1 );
ROUND1( D, E, A, B, C, F1, 9, in[ 7], 1 );
ROUND1( C, D, E, A, B, F1, 11, in[ 8], 1 );
ROUND1( B, C, D, E, A, F1, 13, in[ 9], 1 );
ROUND1( A, B, C, D, E, F1, 14, in[10], 1 );
ROUND1( E, A, B, C, D, F1, 15, in[11], 1 );
ROUND1( D, E, A, B, C, F1, 6, in[12], 1 );
ROUND1( C, D, E, A, B, F1, 7, in[13], 1 );
ROUND1( B, C, D, E, A, F1, 9, in[14], 1 );
ROUND1( A, B, C, D, E, F1, 8, in[15], 1 );
ROUND1( E, A, B, C, D, F2, 7, in[ 7], 2 );
ROUND1( D, E, A, B, C, F2, 6, in[ 4], 2 );
ROUND1( C, D, E, A, B, F2, 8, in[13], 2 );
ROUND1( B, C, D, E, A, F2, 13, in[ 1], 2 );
ROUND1( A, B, C, D, E, F2, 11, in[10], 2 );
ROUND1( E, A, B, C, D, F2, 9, in[ 6], 2 );
ROUND1( D, E, A, B, C, F2, 7, in[15], 2 );
ROUND1( C, D, E, A, B, F2, 15, in[ 3], 2 );
ROUND1( B, C, D, E, A, F2, 7, in[12], 2 );
ROUND1( A, B, C, D, E, F2, 12, in[ 0], 2 );
ROUND1( E, A, B, C, D, F2, 15, in[ 9], 2 );
ROUND1( D, E, A, B, C, F2, 9, in[ 5], 2 );
ROUND1( C, D, E, A, B, F2, 11, in[ 2], 2 );
ROUND1( B, C, D, E, A, F2, 7, in[14], 2 );
ROUND1( A, B, C, D, E, F2, 13, in[11], 2 );
ROUND1( E, A, B, C, D, F2, 12, in[ 8], 2 );
ROUND1( D, E, A, B, C, F3, 11, in[ 3], 3 );
ROUND1( C, D, E, A, B, F3, 13, in[10], 3 );
ROUND1( B, C, D, E, A, F3, 6, in[14], 3 );
ROUND1( A, B, C, D, E, F3, 7, in[ 4], 3 );
ROUND1( E, A, B, C, D, F3, 14, in[ 9], 3 );
ROUND1( D, E, A, B, C, F3, 9, in[15], 3 );
ROUND1( C, D, E, A, B, F3, 13, in[ 8], 3 );
ROUND1( B, C, D, E, A, F3, 15, in[ 1], 3 );
ROUND1( A, B, C, D, E, F3, 14, in[ 2], 3 );
ROUND1( E, A, B, C, D, F3, 8, in[ 7], 3 );
ROUND1( D, E, A, B, C, F3, 13, in[ 0], 3 );
ROUND1( C, D, E, A, B, F3, 6, in[ 6], 3 );
ROUND1( B, C, D, E, A, F3, 5, in[13], 3 );
ROUND1( A, B, C, D, E, F3, 12, in[11], 3 );
ROUND1( E, A, B, C, D, F3, 7, in[ 5], 3 );
ROUND1( D, E, A, B, C, F3, 5, in[12], 3 );
ROUND1( C, D, E, A, B, F4, 11, in[ 1], 4 );
ROUND1( B, C, D, E, A, F4, 12, in[ 9], 4 );
ROUND1( A, B, C, D, E, F4, 14, in[11], 4 );
ROUND1( E, A, B, C, D, F4, 15, in[10], 4 );
ROUND1( D, E, A, B, C, F4, 14, in[ 0], 4 );
ROUND1( C, D, E, A, B, F4, 15, in[ 8], 4 );
ROUND1( B, C, D, E, A, F4, 9, in[12], 4 );
ROUND1( A, B, C, D, E, F4, 8, in[ 4], 4 );
ROUND1( E, A, B, C, D, F4, 9, in[13], 4 );
ROUND1( D, E, A, B, C, F4, 14, in[ 3], 4 );
ROUND1( C, D, E, A, B, F4, 5, in[ 7], 4 );
ROUND1( B, C, D, E, A, F4, 6, in[15], 4 );
ROUND1( A, B, C, D, E, F4, 8, in[14], 4 );
ROUND1( E, A, B, C, D, F4, 6, in[ 5], 4 );
ROUND1( D, E, A, B, C, F4, 5, in[ 6], 4 );
ROUND1( C, D, E, A, B, F4, 12, in[ 2], 4 );
ROUND1( B, C, D, E, A, F5, 9, in[ 4], 5 );
ROUND1( A, B, C, D, E, F5, 15, in[ 0], 5 );
ROUND1( E, A, B, C, D, F5, 5, in[ 5], 5 );
ROUND1( D, E, A, B, C, F5, 11, in[ 9], 5 );
ROUND1( C, D, E, A, B, F5, 6, in[ 7], 5 );
ROUND1( B, C, D, E, A, F5, 8, in[12], 5 );
ROUND1( A, B, C, D, E, F5, 13, in[ 2], 5 );
ROUND1( E, A, B, C, D, F5, 12, in[10], 5 );
ROUND1( D, E, A, B, C, F5, 5, in[14], 5 );
ROUND1( C, D, E, A, B, F5, 12, in[ 1], 5 );
ROUND1( B, C, D, E, A, F5, 13, in[ 3], 5 );
ROUND1( A, B, C, D, E, F5, 14, in[ 8], 5 );
ROUND1( E, A, B, C, D, F5, 11, in[11], 5 );
ROUND1( D, E, A, B, C, F5, 8, in[ 6], 5 );
ROUND1( C, D, E, A, B, F5, 5, in[15], 5 );
ROUND1( B, C, D, E, A, F5, 6, in[13], 5 );
ROUND2( A, B, C, D, E, F5, 8, in[ 5], 1 );
ROUND2( E, A, B, C, D, F5, 9, in[14], 1 );
ROUND2( D, E, A, B, C, F5, 9, in[ 7], 1 );
ROUND2( C, D, E, A, B, F5, 11, in[ 0], 1 );
ROUND2( B, C, D, E, A, F5, 13, in[ 9], 1 );
ROUND2( A, B, C, D, E, F5, 15, in[ 2], 1 );
ROUND2( E, A, B, C, D, F5, 15, in[11], 1 );
ROUND2( D, E, A, B, C, F5, 5, in[ 4], 1 );
ROUND2( C, D, E, A, B, F5, 7, in[13], 1 );
ROUND2( B, C, D, E, A, F5, 7, in[ 6], 1 );
ROUND2( A, B, C, D, E, F5, 8, in[15], 1 );
ROUND2( E, A, B, C, D, F5, 11, in[ 8], 1 );
ROUND2( D, E, A, B, C, F5, 14, in[ 1], 1 );
ROUND2( C, D, E, A, B, F5, 14, in[10], 1 );
ROUND2( B, C, D, E, A, F5, 12, in[ 3], 1 );
ROUND2( A, B, C, D, E, F5, 6, in[12], 1 );
ROUND2( E, A, B, C, D, F4, 9, in[ 6], 2 );
ROUND2( D, E, A, B, C, F4, 13, in[11], 2 );
ROUND2( C, D, E, A, B, F4, 15, in[ 3], 2 );
ROUND2( B, C, D, E, A, F4, 7, in[ 7], 2 );
ROUND2( A, B, C, D, E, F4, 12, in[ 0], 2 );
ROUND2( E, A, B, C, D, F4, 8, in[13], 2 );
ROUND2( D, E, A, B, C, F4, 9, in[ 5], 2 );
ROUND2( C, D, E, A, B, F4, 11, in[10], 2 );
ROUND2( B, C, D, E, A, F4, 7, in[14], 2 );
ROUND2( A, B, C, D, E, F4, 7, in[15], 2 );
ROUND2( E, A, B, C, D, F4, 12, in[ 8], 2 );
ROUND2( D, E, A, B, C, F4, 7, in[12], 2 );
ROUND2( C, D, E, A, B, F4, 6, in[ 4], 2 );
ROUND2( B, C, D, E, A, F4, 15, in[ 9], 2 );
ROUND2( A, B, C, D, E, F4, 13, in[ 1], 2 );
ROUND2( E, A, B, C, D, F4, 11, in[ 2], 2 );
ROUND2( D, E, A, B, C, F3, 9, in[15], 3 );
ROUND2( C, D, E, A, B, F3, 7, in[ 5], 3 );
ROUND2( B, C, D, E, A, F3, 15, in[ 1], 3 );
ROUND2( A, B, C, D, E, F3, 11, in[ 3], 3 );
ROUND2( E, A, B, C, D, F3, 8, in[ 7], 3 );
ROUND2( D, E, A, B, C, F3, 6, in[14], 3 );
ROUND2( C, D, E, A, B, F3, 6, in[ 6], 3 );
ROUND2( B, C, D, E, A, F3, 14, in[ 9], 3 );
ROUND2( A, B, C, D, E, F3, 12, in[11], 3 );
ROUND2( E, A, B, C, D, F3, 13, in[ 8], 3 );
ROUND2( D, E, A, B, C, F3, 5, in[12], 3 );
ROUND2( C, D, E, A, B, F3, 14, in[ 2], 3 );
ROUND2( B, C, D, E, A, F3, 13, in[10], 3 );
ROUND2( A, B, C, D, E, F3, 13, in[ 0], 3 );
ROUND2( E, A, B, C, D, F3, 7, in[ 4], 3 );
ROUND2( D, E, A, B, C, F3, 5, in[13], 3 );
ROUND2( C, D, E, A, B, F2, 15, in[ 8], 4 );
ROUND2( B, C, D, E, A, F2, 5, in[ 6], 4 );
ROUND2( A, B, C, D, E, F2, 8, in[ 4], 4 );
ROUND2( E, A, B, C, D, F2, 11, in[ 1], 4 );
ROUND2( D, E, A, B, C, F2, 14, in[ 3], 4 );
ROUND2( C, D, E, A, B, F2, 14, in[11], 4 );
ROUND2( B, C, D, E, A, F2, 6, in[15], 4 );
ROUND2( A, B, C, D, E, F2, 14, in[ 0], 4 );
ROUND2( E, A, B, C, D, F2, 6, in[ 5], 4 );
ROUND2( D, E, A, B, C, F2, 9, in[12], 4 );
ROUND2( C, D, E, A, B, F2, 12, in[ 2], 4 );
ROUND2( B, C, D, E, A, F2, 9, in[13], 4 );
ROUND2( A, B, C, D, E, F2, 12, in[ 9], 4 );
ROUND2( E, A, B, C, D, F2, 5, in[ 7], 4 );
ROUND2( D, E, A, B, C, F2, 15, in[10], 4 );
ROUND2( C, D, E, A, B, F2, 8, in[14], 4 );
ROUND2( B, C, D, E, A, F1, 8, in[12], 5 );
ROUND2( A, B, C, D, E, F1, 5, in[15], 5 );
ROUND2( E, A, B, C, D, F1, 12, in[10], 5 );
ROUND2( D, E, A, B, C, F1, 9, in[ 4], 5 );
ROUND2( C, D, E, A, B, F1, 12, in[ 1], 5 );
ROUND2( B, C, D, E, A, F1, 5, in[ 5], 5 );
ROUND2( A, B, C, D, E, F1, 14, in[ 8], 5 );
ROUND2( E, A, B, C, D, F1, 6, in[ 7], 5 );
ROUND2( D, E, A, B, C, F1, 8, in[ 6], 5 );
ROUND2( C, D, E, A, B, F1, 13, in[ 2], 5 );
ROUND2( B, C, D, E, A, F1, 6, in[13], 5 );
ROUND2( A, B, C, D, E, F1, 5, in[14], 5 );
ROUND2( E, A, B, C, D, F1, 15, in[ 0], 5 );
ROUND2( D, E, A, B, C, F1, 13, in[ 3], 5 );
ROUND2( C, D, E, A, B, F1, 11, in[ 9], 5 );
ROUND2( B, C, D, E, A, F1, 11, in[11], 5 );
tmp = _mm_add_epi32( _mm_add_epi32( h[1], C1 ), D2 );
h[1] = _mm_add_epi32( _mm_add_epi32( h[2], D1 ), E2 );
h[2] = _mm_add_epi32( _mm_add_epi32( h[3], E1 ), A2 );
h[3] = _mm_add_epi32( _mm_add_epi32( h[4], A1 ), B2 );
h[4] = _mm_add_epi32( _mm_add_epi32( h[0], B1 ), C2 );
h[0] = tmp;
}
void ripemd160_4way_init( ripemd160_4way_context *sc )
{
sc->val[0] = _mm_set1_epi32( IV[0] );
sc->val[1] = _mm_set1_epi32( IV[1] );
sc->val[2] = _mm_set1_epi32( IV[2] );
sc->val[3] = _mm_set1_epi32( IV[3] );
sc->val[4] = _mm_set1_epi32( IV[4] );
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
const int block_size = 64;
ptr = (unsigned)sc->count_low & (block_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = block_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == block_size )
{
ripemd160_4way_round( sc );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int block_size = 64;
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_4way_round( sc );
memset_zero_128( sc->buf, pad>>2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
}
#endif
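Note: the 4-way input is interleaved by 32-bit word, so lane-identical data can be built with _mm_set1_epi32. A usage sketch that hashes the same 64-byte message in all four lanes (assumes the header above is on the include path):
#include <immintrin.h>
#include <stdint.h>
#include "ripemd-hash-4way.h"
void ripemd160_x4_same( uint32_t digest[5], const uint32_t msg[16] )
{
   ripemd160_4way_context ctx;
   __m128i vmsg[16], vdig[5];
   for ( int i = 0; i < 16; i++ )
      vmsg[i] = _mm_set1_epi32( msg[i] );    // word i of every lane
   ripemd160_4way_init( &ctx );
   ripemd160_4way( &ctx, vmsg, 64 );
   ripemd160_4way_close( &ctx, vdig );
   for ( int i = 0; i < 5; i++ )
      digest[i] = (uint32_t)_mm_cvtsi128_si32( vdig[i] );  // extract lane 0
}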

View File

@@ -0,0 +1,23 @@
#ifndef RIPEMD_HASH_4WAY_H__
#define RIPEMD_HASH_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined(__AVX__)
#include "avxdefs.h"
typedef struct
{
__m128i buf[64>>2];
__m128i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
#endif
#endif

View File

@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;

View File

@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
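In the BE64 branches above sc->count is a byte count, so count << 3 yields the low 64 bits of the message bit length and count >> 61 the high bits of the 128 bit count that SHA-384/512 style padding requires. A scalar sketch of the same big-endian encoding (hypothetical helper, illustration only):

#include <stdint.h>

// Write the 128 bit big-endian bit count for a message of count_bytes
// bytes into the last 16 bytes of the pad block, high word first.
static void encode_len_be128( uint8_t *pad_end, uint64_t count_bytes )
{
   uint64_t hi = count_bytes >> 61;   // bits 64..127 of 8*count
   uint64_t lo = count_bytes << 3;    // bits  0..63  of 8*count
   for ( int i = 0; i < 8; i++ )
   {
      pad_end[ i ]     = (uint8_t)( hi >> ( 56 - 8*i ) );
      pad_end[ i + 8 ] = (uint8_t)( lo >> ( 56 - 8*i ) );
   }
}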

View File

@@ -1,247 +0,0 @@
/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-384 / SHA-512 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_sha2.h"
#if SPH_64
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
#define ROTR64 SPH_ROTR64
#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
static const sph_u64 H384[8] = {
SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
};
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
/*
* This macro defines the body for a SHA-384 / SHA-512 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "r" parameter should evaluate to
* an array or pointer expression designating the array of 8 words which
* contains the input and output of the compression function.
*
* SHA-512 is hard for the compiler. If the loop is completely unrolled,
* then the code will be quite huge (possibly more than 100 kB), and the
* performance will be degraded due to cache misses on the code. We
* unroll only eight steps, which avoids all needless copies when
* 64-bit registers are swapped.
*/
#define SHA3_STEP(A, B, C, D, E, F, G, H, i) do { \
sph_u64 T1, T2; \
T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
D = SPH_T64(D + T1); \
H = SPH_T64(T1 + T2); \
} while (0)
#define SHA3_ROUND_BODY(in, r) do { \
int i; \
sph_u64 A, B, C, D, E, F, G, H; \
sph_u64 W[80]; \
\
for (i = 0; i < 16; i ++) \
W[i] = in(i); \
for (i = 16; i < 80; i ++) \
W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
+ SSG5_0(W[i - 15]) + W[i - 16]); \
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
for (i = 0; i < 80; i += 8) { \
SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
} \
(r)[0] = SPH_T64((r)[0] + A); \
(r)[1] = SPH_T64((r)[1] + B); \
(r)[2] = SPH_T64((r)[2] + C); \
(r)[3] = SPH_T64((r)[3] + D); \
(r)[4] = SPH_T64((r)[4] + E); \
(r)[5] = SPH_T64((r)[5] + F); \
(r)[6] = SPH_T64((r)[6] + G); \
(r)[7] = SPH_T64((r)[7] + H); \
} while (0)
/*
* One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access.
*/
static void
sha3_round(const unsigned char *data, sph_u64 r[8])
{
#define SHA3_IN(x) sph_dec64be_aligned(data + (8 * (x)))
SHA3_ROUND_BODY(SHA3_IN, r);
#undef SHA3_IN
}
/* see sph_sha3.h */
void
sph_sha384_init(void *cc)
{
sph_sha384_context *sc;
sc = cc;
memcpy(sc->val, H384, sizeof H384);
sc->count = 0;
}
/* see sph_sha3.h */
void
sph_sha512_init(void *cc)
{
sph_sha512_context *sc;
sc = cc;
memcpy(sc->val, H512, sizeof H512);
sc->count = 0;
}
#define RFUN sha3_round
#define HASH sha384
#define BE64 1
#include "md_helper.c"
/* see sph_sha3.h */
void
sph_sha384_close(void *cc, void *dst)
{
sha384_close(cc, dst, 6);
// sph_sha384_init(cc);
}
/* see sph_sha3.h */
void
sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha384_addbits_and_close(cc, ub, n, dst, 6);
// sph_sha384_init(cc);
}
/* see sph_sha3.h */
void
sph_sha512_close(void *cc, void *dst)
{
sha384_close(cc, dst, 8);
// sph_sha512_init(cc);
}
/* see sph_sha3.h */
void
sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha384_addbits_and_close(cc, ub, n, dst, 8);
// sph_sha512_init(cc);
}
/* see sph_sha3.h */
void
sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
{
#define SHA3_IN(x) msg[x]
SHA3_ROUND_BODY(SHA3_IN, val);
#undef SHA3_IN
}
#endif
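This scalar implementation is dropped in favour of the vectored code elsewhere in this change set. For reference, callers consumed it through the standard three call sph pattern; a minimal sketch (the wrapper name is hypothetical):

#include "sph_sha2.h"

// One-shot SHA-512 of an arbitrary buffer; digest receives 64 bytes.
void sha512_sketch( void *digest, const void *data, size_t len )
{
   sph_sha512_context ctx;

   sph_sha512_init( &ctx );
   sph_sha512( &ctx, data, len );     // may be called repeatedly to stream
   sph_sha512_close( &ctx, digest );
}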

View File

@@ -30,13 +30,282 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined(__AVX__)
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#include <stdio.h>
// SHA256 4 way 32 bit
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
static const sph_u32 K256[64] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};
#define SHA2s_MEXP( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
#define MAJs(X, Y, Z) \
_mm_or_si128( _mm_and_si128( X, Y ), \
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 2), mm_rotr_32(x, 13) ), mm_rotr_32( x, 22) )
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 6), mm_rotr_32(x, 11) ), mm_rotr_32( x, 25) )
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 7), mm_rotr_32(x, 18) ), _mm_srli_epi32(x, 3) )
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
static void
sha256_4way_round( __m128i *in, __m128i r[8] )
{
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];
W[ 0] = mm_bswap_32( in[ 0] );
W[ 1] = mm_bswap_32( in[ 1] );
W[ 2] = mm_bswap_32( in[ 2] );
W[ 3] = mm_bswap_32( in[ 3] );
W[ 4] = mm_bswap_32( in[ 4] );
W[ 5] = mm_bswap_32( in[ 5] );
W[ 6] = mm_bswap_32( in[ 6] );
W[ 7] = mm_bswap_32( in[ 7] );
W[ 8] = mm_bswap_32( in[ 8] );
W[ 9] = mm_bswap_32( in[ 9] );
W[10] = mm_bswap_32( in[10] );
W[11] = mm_bswap_32( in[11] );
W[12] = mm_bswap_32( in[12] );
W[13] = mm_bswap_32( in[13] );
W[14] = mm_bswap_32( in[14] );
W[15] = mm_bswap_32( in[15] );
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
r[0] = _mm_add_epi32( r[0], A );
r[1] = _mm_add_epi32( r[1], B );
r[2] = _mm_add_epi32( r[2], C );
r[3] = _mm_add_epi32( r[3], D );
r[4] = _mm_add_epi32( r[4], E );
r[5] = _mm_add_epi32( r[5], F );
r[6] = _mm_add_epi32( r[6], G );
r[7] = _mm_add_epi32( r[7], H );
}
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm_set1_epi32( H256[0] );
sc->val[1] = _mm_set1_epi32( H256[1] );
sc->val[2] = _mm_set1_epi32( H256[2] );
sc->val[3] = _mm_set1_epi32( H256[3] );
sc->val[4] = _mm_set1_epi32( H256[4] );
sc->val[5] = _mm_set1_epi32( H256[5] );
sc->val[6] = _mm_set1_epi32( H256[6] );
sc->val[7] = _mm_set1_epi32( H256[7] );
}
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_4way_round( sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
clow2 = SPH_T32( clow + clen );
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_round( sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] =
mm_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
}
#if defined(__AVX2__)
// SHA512 4 way 64 bit
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
@@ -80,13 +349,6 @@ static const sph_u64 K512[80] = {
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -112,7 +374,7 @@ static const sph_u64 H512[8] = {
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
register __m256i T1, T2; \
T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
_mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
_mm256_set1_epi64x( K512[i] ) ), W[i] ); \
@@ -125,11 +387,11 @@ static void
sha512_4way_round( __m256i *in, __m256i r[8] )
{
int i;
__m256i A, B, C, D, E, F, G, H;
register __m256i A, B, C, D, E, F, G, H;
__m256i W[80];
for ( i = 0; i < 16; i++ )
W[i] = mm256_byteswap_64( in[i] );
W[i] = mm256_bswap_64( in[i] );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -182,7 +444,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
int buf_size = 128;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
@@ -207,13 +469,12 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
unsigned ptr, u;
int buf_size = 128;
int pad = buf_size - 16;
const int buf_size = 128;
const int pad = buf_size - 16;
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
@@ -224,13 +485,14 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
}
#endif
#endif // __AVX2__
#endif // __AVX__
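A note on the message schedule above: it is computed in place over a 16 entry ring, so SHA2s_MEXP( a, b, c, d ) is the standard recurrence W[t] = SSG2_1( W[t-2] ) + W[t-7] + SSG2_0( W[t-15] ) + W[t-16] with all indices reduced mod 16, i.e. slot i uses ( (i-2)&15, (i-7)&15, (i-15)&15, i ). A throwaway generator (plain C, illustration only) reproduces the table:

#include <stdio.h>

int main()
{
   // Prints exactly the 16 SHA2s_MEXP lines used in sha256_4way_round.
   for ( int i = 0; i < 16; i++ )
      printf( "W[%2d] = SHA2s_MEXP( %2d, %2d, %2d, %2d );\n",
              i, (i-2)&15, (i-7)&15, (i-15)&15, i );
   return 0;
}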

View File

@@ -44,47 +44,19 @@
#include "sph_types.h"
#include "avxdefs.h"
#if 0
#define SPH_SIZE_sha224 224
#if defined(__AVX__)
#define SPH_SIZE_sha256 256
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u32 val[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha224_context;
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context;
typedef sph_sha224_context sph_sha256_context;
void sph_sha224_init(void *cc);
void sph_sha224(void *cc, const void *data, size_t len);
void sph_sha224_close(void *cc, void *dst);
void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
void sph_sha256_init(void *cc);
void sph_sha256(void *cc, const void *data, size_t len);
void sph_sha256_close(void *cc, void *dst);
void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
#endif
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
#if defined (__AVX2__)
@@ -102,3 +74,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif
#endif
#endif
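A minimal sketch of using this interface for double SHA-256 over four lanes (the function name is hypothetical, and vdata is assumed to be already interleaved 4x32, e.g. with mm_interleave_4x32 from avxdefs.h):

#include "sha2-hash-4way.h"

// Double SHA-256 of an 80 byte header per lane. The first pass output
// is already interleaved, so it feeds the second pass directly.
void sha256d_4way_sketch( uint32_t *vhash, const uint32_t *vdata )
{
   sha256_4way_context ctx;

   sha256_4way_init( &ctx );
   sha256_4way( &ctx, vdata, 80 );
   sha256_4way_close( &ctx, vhash );   // 32 bytes per lane, interleaved

   sha256_4way_init( &ctx );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, vhash );
}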

View File

@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Return the high 128 bits of hi:lo rotated right by n 32 bit lanes:
// hi is shifted right n lanes and the vacated high lanes are filled
// from the low end of lo.
// Similar to mm_rotr256_1x32 but only a partial rotation, as lo is
// never completed, making it faster than a full 256 bit rotation.
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
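A standalone sanity check of those semantics (an assumed example, SSE2 only): with lanes written low to high, rotating hi = {4,5,6,7} over lo = {10,11,12,13} by one lane returns {5,6,7,10} as the new high half.

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main()
{
   __m128i hi = _mm_set_epi32( 7, 6, 5, 4 );      // lanes 4,5,6,7
   __m128i lo = _mm_set_epi32( 13, 12, 11, 10 );  // lanes 10,11,12,13
   // Same expression as mm_rotr256hi_1x32( hi, lo, 1 ).
   __m128i r = _mm_or_si128( _mm_srli_si128( hi, 4 ),
                             _mm_slli_si128( lo, 12 ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );  // 5 6 7 10
   return 0;
}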
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
@@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
@@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );

algo/simd/simd-hash-2way.c (new file, 853 lines)
View File

@@ -0,0 +1,853 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "simd-hash-2way.h"
#if defined (__AVX2__)
// imported from simd_iv.h
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
/* Twiddle tables */
static const m256_v16 FFT64_Twiddle[] =
{
{{ 1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128 }},
{{ 1, 60, 2, 120, 4, -17, 8, -34,
1, 60, 2, 120, 4, -17, 8, -34 }},
{{ 1, 120, 8, -68, 64, -30, -2, 17,
1, 120, 8, -68, 64, -30, -2, 17 }},
{{ 1, 46, 60, -67, 2, 92, 120, 123,
1, 46, 60, -67, 2, 92, 120, 123 }},
{{ 1, 92, -17, -22, 32, 117, -30, 67,
1, 92, -17, -22, 32, 117, -30, 67 }},
{{ 1, -67, 120, -73, 8, -22, -68, -70,
1, -67, 120, -73, 8, -22, -68, -70 }},
{{ 1, 123, -34, -70, 128, 67, 17, 35,
1, 123, -34, -70, 128, 67, 17, 35 }},
};
static const m256_v16 FFT128_Twiddle[] =
{
{{ 1, -118, 46, -31, 60, 116, -67, -61,
1, -118, 46, -31, 60, 116, -67, -61 }},
{{ 2, 21, 92, -62, 120, -25, 123, -122,
2, 21, 92, -62, 120, -25, 123, -122 }},
{{ 4, 42, -73, -124, -17, -50, -11, 13,
4, 42, -73, -124, -17, -50, -11, 13 }},
{{ 8, 84, 111, 9, -34, -100, -22, 26,
8, 84, 111, 9, -34, -100, -22, 26 }},
{{ 16, -89, -35, 18, -68, 57, -44, 52,
16, -89, -35, 18, -68, 57, -44, 52 }},
{{ 32, 79, -70, 36, 121, 114, -88, 104,
32, 79, -70, 36, 121, 114, -88, 104 }},
{{ 64, -99, 117, 72, -15, -29, 81, -49,
64, -99, 117, 72, -15, -29, 81, -49 }},
{{ 128, 59, -23, -113, -30, -58, -95, -98,
128, 59, -23, -113, -30, -58, -95, -98 }},
};
static const m256_v16 FFT256_Twiddle[] =
{
{{ 1, 41, -118, 45, 46, 87, -31, 14,
1, 41, -118, 45, 46, 87, -31, 14 }},
{{ 60, -110, 116, -127, -67, 80, -61, 69,
60, -110, 116, -127, -67, 80, -61, 69 }},
{{ 2, 82, 21, 90, 92, -83, -62, 28,
2, 82, 21, 90, 92, -83, -62, 28 }},
{{ 120, 37, -25, 3, 123, -97, -122, -119,
120, 37, -25, 3, 123, -97, -122, -119 }},
{{ 4, -93, 42, -77, -73, 91, -124, 56,
4, -93, 42, -77, -73, 91, -124, 56 }},
{{ -17, 74, -50, 6, -11, 63, 13, 19,
-17, 74, -50, 6, -11, 63, 13, 19 }},
{{ 8, 71, 84, 103, 111, -75, 9, 112,
8, 71, 84, 103, 111, -75, 9, 112 }},
{{ -34, -109, -100, 12, -22, 126, 26, 38,
-34, -109, -100, 12, -22, 126, 26, 38 }},
{{ 16, -115, -89, -51, -35, 107, 18, -33,
16, -115, -89, -51, -35, 107, 18, -33 }},
{{ -68, 39, 57, 24, -44, -5, 52, 76,
-68, 39, 57, 24, -44, -5, 52, 76 }},
{{ 32, 27, 79, -102, -70, -43, 36, -66,
32, 27, 79, -102, -70, -43, 36, -66 }},
{{ 121, 78, 114, 48, -88, -10, 104, -105,
121, 78, 114, 48, -88, -10, 104, -105 }},
{{ 64, 54, -99, 53, 117, -86, 72, 125,
64, 54, -99, 53, 117, -86, 72, 125 }},
{{ -15, -101, -29, 96, 81, -20, -49, 47,
-15, -101, -29, 96, 81, -20, -49, 47 }},
{{ 128, 108, 59, 106, -23, 85, -113, -7,
128, 108, 59, 106, -23, 85, -113, -7 }},
{{ -30, 55, -58, -65, -95, -40, -98, 94,
-30, 55, -58, -65, -95, -40, -98, 94 }}
};
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
// imported from vector.c
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
_mm256_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, \
_mm256_and_si256( _mm256_set1_epi16( 257 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
#define DO_REDUCE( i ) X(i) = REDUCE( X(i) )
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE( X(i) ); \
X(i) = EXTRA_REDUCE_S( X(i) ); \
} while(0)
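REDUCE works because 256 ≡ -1 (mod 257): writing a 16 bit value as x = 256q + r gives x ≡ r - q (mod 257), so one AND and one arithmetic shift make a cheap partial reduction, and EXTRA_REDUCE_S folds anything above 128 back into the signed range [-128, 128]. A scalar model (illustration only):

#include <stdint.h>

static inline int16_t reduce( int16_t x )         // partial: r - q mod 257
{
   return (int16_t)( ( x & 255 ) - ( x >> 8 ) );
}

static inline int16_t extra_reduce_s( int16_t x ) // fold into [-128, 128]
{
   return x > 128 ? (int16_t)( x - 257 ) : x;
}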
void fft64_2way( void *a )
{
__m256i* const A = a;
register __m256i X0, X1, X2, X3, X4, X5, X6, X7;
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) X(i) = REDUCE( X(i) )
// Begin with 8 parallel DIF FFT_8 transforms.
//
// FFT_8 using w=4 as the 8th root of unity
//  Unrolled decimation in frequency (DIF) radix-2 NTT.
//  Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
#define BUTTERFLY_0( i,j ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
DO_REDUCE( 1 );
BUTTERFLY_0( 0, 1 );
BUTTERFLY_0( 2, 3 );
BUTTERFLY_0( 4, 5 );
BUTTERFLY_0( 6, 7 );
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
DO_REDUCE_FULL_S( 2 );
DO_REDUCE_FULL_S( 3 );
DO_REDUCE_FULL_S( 4 );
DO_REDUCE_FULL_S( 5 );
DO_REDUCE_FULL_S( 6 );
#undef BUTTERFLY_0
#undef BUTTERFLY_N
// Multiply by twiddle factors
X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
// Transpose the FFT state with a revbin order permutation
// on the rows and the columns.
// This puts the full FFT_64 output in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)
INTERLEAVE( 1, 0 );
INTERLEAVE( 3, 2 );
INTERLEAVE( 5, 4 );
INTERLEAVE( 7, 6 );
INTERLEAVE( 2, 0 );
INTERLEAVE( 3, 1 );
INTERLEAVE( 6, 4 );
INTERLEAVE( 7, 5 );
INTERLEAVE( 4, 0 );
INTERLEAVE( 5, 1 );
INTERLEAVE( 6, 2 );
INTERLEAVE( 7, 3 );
#undef INTERLEAVE
// Finish with 8 parallel DIT FFT_8 transforms.
// FFT_8 using w=4 as the 8th root of unity
// Unrolled decimation in time (DIT) radix-2 NTT.
// Input data is in revbin_permuted order.
#define BUTTERFLY_0( i,j ) \
do { \
__m256i u = X(j); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
DO_REDUCE( 0 );
DO_REDUCE( 1 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
DO_REDUCE( 4 );
DO_REDUCE( 5 );
DO_REDUCE( 6 );
DO_REDUCE( 7 );
BUTTERFLY_0( 0, 1 );
BUTTERFLY_0( 2, 3 );
BUTTERFLY_0( 4, 5 );
BUTTERFLY_0( 6, 7 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
DO_REDUCE_FULL_S( 2 );
DO_REDUCE_FULL_S( 3 );
DO_REDUCE_FULL_S( 4 );
DO_REDUCE_FULL_S( 5 );
DO_REDUCE_FULL_S( 6 );
DO_REDUCE_FULL_S( 7 );
#undef BUTTERFLY_0
#undef BUTTERFLY_N
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
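The routine above is a 64 point NTT over GF(257) in classic 8x8 row-column form: eight parallel 8 point DIF transforms, multiplication by the FFT64_Twiddle factors, a transpose, then eight 8 point DIT transforms. Its shift based butterflies depend on w = 4 being an 8th root of unity mod 257, which a throwaway check confirms (plain C, illustration only):

#include <stdio.h>

int main()
{
   int w = 1;
   for ( int i = 0; i < 8; i++ ) w = ( w * 4 ) % 257;
   printf( "4^8 mod 257 = %d\n", w );   // prints 1
   return 0;
}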
void fft128_2way( void *a )
{
int i;
// Temporary space to simplify the final interleaving.
__m256i B[8];
__m256i *A = (__m256i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
{
B[ i ] = _mm256_add_epi16( A[ i ], A[ i+8 ] );
B[ i ] = REDUCE_FULL_S( B[ i ] );
A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
}
fft64_2way( B );
fft64_2way( A+8 );
/* Transpose (i.e. interleave) */
for ( i = 0; i < 8; i++ )
{
A[ 2*i ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] );
A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] );
}
}
void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
__m256i *X = (__m256i*)x;
__m256i *A = (__m256i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
// This allows tweaking the last butterflies to introduce X^127.
#define UNPACK_TWEAK( i,tw ) \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT128_Twiddle[ 2*i+1 ].m256i );\
A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)
UNPACK( 0 );
UNPACK( 1 );
UNPACK( 2 );
if ( final )
UNPACK_TWEAK( 3, FinalTweak.m256i );
else
UNPACK_TWEAK( 3, Tweak.m256i );
#undef UNPACK
#undef UNPACK_TWEAK
fft64_2way( a );
fft64_2way( a+128 );
}
void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
__m256i *X = (__m256i*)x;
__m256i *A = (__m256i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)
// This allows tweaking the last butterflies to introduce X^127.
#define UNPACK_TWEAK( i,tw ) \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
} while(0)
UNPACK( 0 );
UNPACK( 1 );
UNPACK( 2 );
UNPACK( 3 );
UNPACK( 4 );
UNPACK( 5 );
UNPACK( 6 );
if ( final )
UNPACK_TWEAK( 7, FinalTweak.m256i );
else
UNPACK_TWEAK( 7, Tweak.m256i );
#undef UNPACK
#undef UNPACK_TWEAK
fft128_2way( a );
fft128_2way( a+256 );
}
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
register __m256i S0l, S1l, S2l, S3l;
register __m256i S0h, S1h, S2h, S3h;
__m256i *S = (__m256i*) state;
__m256i *M = (__m256i*) msg;
__m256i *W = (__m256i*) fft;
static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
S0l = _mm256_xor_si256( S[0], M[0] );
S0h = _mm256_xor_si256( S[1], M[1] );
S1l = _mm256_xor_si256( S[2], M[2] );
S1h = _mm256_xor_si256( S[3], M[3] );
S2l = _mm256_xor_si256( S[4], M[4] );
S2h = _mm256_xor_si256( S[5], M[5] );
S3l = _mm256_xor_si256( S[6], M[6] );
S3h = _mm256_xor_si256( S[7], M[7] );
#define S(i) S##i
#define F_0(B, C, D) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
#define F_1(B, C, D) \
_mm256_or_si256( _mm256_and_si256( D, C ),\
_mm256_and_si256( _mm256_or_si256( D,C ), B ) )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
// We split the round function into two halves
// so that independent computations can be interleaved between them.
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \
a##l = mm256_rotl_32( a##l, r ); \
a##h = mm256_rotl_32( a##h, r ); \
w##l = _mm256_add_epi32( w##l, d##l ); \
w##h = _mm256_add_epi32( w##h, d##h ); \
TTl = _mm256_add_epi32( TTl, w##l ); \
TTh = _mm256_add_epi32( TTh, w##h ); \
TTl = mm256_rotl_32( TTl, s ); \
TTh = mm256_rotl_32( TTh, s ); \
PERM( z,d,a ); \
} while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
d##l = _mm256_add_epi32( d##l, TTl ); \
d##h = _mm256_add_epi32( d##h, TTh ); \
} while(0)
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
do { \
register __m256i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \
STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0)
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG( w,hh,ll,u,z ) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
} while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
do { \
register __m256i W0l, W1l, W2l, W3l, TTl; \
register __m256i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)
// 4 rounds with code 185
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
// 4 rounds with code 233
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
// 1 round as feed-forward
#define PERM_START 4
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1
#undef STEP_2
#undef STEP
#undef ROUND
}
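The SUM7 tables are addition mod 7 spelled out for the preprocessor: SUM7_xy expands to (x + y) mod 7, which lets PERM pick one of the seven lane permutations at expansion time, with PERM_START as the per round offset. A throwaway generator (illustration only) reproduces the 49 defines:

#include <stdio.h>

int main()
{
   for ( int x = 0; x < 7; x++ )
      for ( int y = 0; y < 7; y++ )
         printf( "#define SUM7_%d%d %d\n", x, y, ( x + y ) % 7 );
   return 0;
}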
void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
{
m256_v16 Y[32];
uint16_t *y = (uint16_t*) Y[0].u16;
fft256_2way_msg( y, m, final );
rounds512_2way( state->A, m, y );
}
// imported from nist.c
int simd_2way_init( simd_2way_context *state, int hashbitlen )
{
__m256i *A = (__m256i*)state->A;
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
state->count = 0;
for ( int i = 0; i < 8; i++ )
A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0],
SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
return 0;
}
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen )
{
int bs = state->blocksize;
int current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_2way_Compress( state, data, 0 );
databitlen -= bs;
data += 2*(bs/8);
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 2*(current/8), data, 2*((databitlen+7)/8) );
state->count += databitlen;
return 0;
}
else
{
memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
state->count += len;
databitlen -= len;
data += 2*(len/8);
current = 0;
SIMD_2way_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int simd_2way_close( simd_2way_context *state, void *hashval )
{
uint64_t l;
int current = state->count & (state->blocksize - 1);
int i;
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
current = ( current+7 ) / 8;
memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) );
SIMD_2way_Compress( state, state->buffer, 0 );
}
// Input the message length as the last block
memset( state->buffer, 0, 2*(state->blocksize / 8) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_2way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 2*(state->hashbitlen / 8) );
return 0;
}
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen )
{
int current, i;
int bs = state->blocksize; // bits in one lane
int isshort = 1;
uint64_t l;
current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_2way_Compress( state, data, 0 );
databitlen -= bs;
data += 2*( bs/8 );
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
state->count += databitlen;
break;
}
else
{
memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
state->count += len;
databitlen -= len;
data += 2*( len/8 );
current = 0;
SIMD_2way_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
current = ( current+7 ) / 8;
memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
SIMD_2way_Compress( state, state->buffer, 0 );
}
// Input the message length as the last block
memset( state->buffer, 0, 2*( state->blocksize/8 ) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_2way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
return 0;
}
#endif
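A minimal sketch of driving this interface (the function name is hypothetical; vdata is assumed to hold two 80 byte lanes interleaved 128 bits at a time, e.g. with mm256_interleave_2x128 from avxdefs.h, and all lengths are in bits):

#include "simd-hash-2way.h"

// One-shot SIMD-512 over two interleaved lanes; vhash receives
// 2 x 64 bytes, still interleaved.
void simd512_2way_sketch( void *vhash, const void *vdata )
{
   simd_2way_context ctx;

   simd_2way_init( &ctx, 512 );
   simd_2way_update_close( &ctx, vhash, vdata, 640 );  // 640 bits per lane
}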

View File

@@ -0,0 +1,27 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context;
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif

View File

@@ -1,3 +1,6 @@
#if !defined(SIMD_IV_H__)
#define SIMD_IV_H__
u32 IV_224[] = {
0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
#endif

View File

@@ -1,23 +0,0 @@
#ifndef DEFS_X5_H__
#define DEFS_X5_H__
#include <emmintrin.h>
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;
typedef struct {
uint32 buffer[8]; /* Buffer to be hashed */
__m128i chainv[10]; /* Chaining values */
uint64 bitlen[2]; /* Message length in bits */
uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;
} hashState_luffa;
typedef unsigned char byte;
#endif

File diff suppressed because it is too large

View File

@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {

View File

@@ -1,47 +1,29 @@
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
#if defined (__AVX2__)
#if defined (SKEIN_4WAY)
void skeinhash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
SHA256_CTX ctx_sha256;
sha256_4way_context ctx_sha256;
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
skein512_4way_close( &ctx_skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, vhash32 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( state, hash0, 32 );
memcpy( state + 32, hash1, 32 );
memcpy( state + 64, hash2, 32 );
memcpy( state + 96, hash3, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash32, 256 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,

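The rewritten skeinhash_4way keeps all four lanes vectorized end to end: instead of deinterleaving Skein's 4x64-bit output and running four serial OpenSSL SHA256 passes, it converts the 64-bit-interleaved buffer straight to 32-bit interleaving and runs sha256_4way once. A scalar model of what mm256_reinterleave_4x32 has to do, under the assumption that lane l's word w sits at vector index 4*w + l in both layouts (illustrative only; the real routine is vectorized):

// d32: 4x32-bit interleaved out, s64: 4x64-bit interleaved in,
// bitlen: per-lane length in bits (512 here).
static void reinterleave_4x32_model( uint32_t *d32, const uint64_t *s64,
                                     int bitlen )
{
   const uint32_t *s32 = (const uint32_t*)s64;
   for ( int l = 0; l < 4; l++ )
      for ( int w = 0; w < bitlen/32; w++ )
         // lane l, 32-bit word w = half (w & 1) of 64-bit word w/2
         d32[ 4*w + l ] = s32[ 8*(w/2) + 2*l + (w & 1) ];
}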
View File

@@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_byteswap_32(
count[0] = mm_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] );
hash[i] = mm_bswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
@@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
W[j] = mm_bswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],

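The rename from mm_byteswap_32 to mm_bswap_32 is cosmetic; the operation reverses the byte order inside each 32-bit lane of an __m128i, which is what SM3 needs to move between its big-endian message words and little-endian x86 loads. One common SSSE3 implementation, sketched here for reference (the actual definition lives in this tree's SIMD helper headers):

#include <immintrin.h>
static inline __m128i mm_bswap_32_sketch( __m128i x )
{
   // Per-byte shuffle: destination byte i takes the source byte named by
   // control byte i, so each 4-byte group is emitted in reverse order.
   return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}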
View File

@@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif

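What the BE64 cases are computing: sc->count holds the message length in bytes, and the closing routine must store the length in bits as a big-endian 128-bit integer in the last two 64-bit words of the final block. Multiplying by 8 is a 3-bit left shift, so the low word is count << 3 and the high word is count >> 61 (the three bits shifted out on the left). For example, count = 2^61 bytes gives 2^64 bits: high word 1, low word 0. PLW1 stores only the low word; PLW4 additionally zeroes two words before the length pair.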
View File

@@ -11,15 +11,15 @@ void whirlpoolx_hash(void *state, const void *input)
sph_whirlpool_context ctx_whirlpool;
unsigned char hash[64];
unsigned char hash_xored[32];
// unsigned char hash_xored[32];
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, input, 80);
sph_whirlpool1_close(&ctx_whirlpool, hash);
// compress the 48 first bytes of the hash to 32
for (int i = 0; i < 32; i++)
hash_xored[i] = hash[i] ^ hash[i + 16];
// for (int i = 0; i < 32; i++)
// hash_xored[i] = hash[i] ^ hash[i + 16];
memcpy(state, hash, 32);
}

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -25,10 +25,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} c11_4way_ctx_holder;
@@ -42,10 +42,10 @@ void init_c11_4way_ctx()
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
luffa_2way_init( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
simd_2way_init( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
@@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
@@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,

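This hunk is the template for most of the changes in this commit: four serial single-lane Luffa/SIMD calls, each followed by a memcpy to restore the context, become two 2-way calls over 128-bit-interleaved lane pairs with a cheap re-init in between. Reduced to its general shape (Luffa shown; the SIMD hunks are identical apart from the bit-length argument):

mm256_interleave_2x128( vhash,  hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash,  vhash,  64 );
luffa_2way_init( &ctx.luffa, 512 );   // reset for the second lane pair
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash,  512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

Two compression passes instead of four, and the per-lane context restores disappear.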
View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -12,7 +12,7 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
@@ -25,7 +25,7 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
} tt8_4way_ctx_holder;
@@ -39,7 +39,7 @@ void init_tt8_4way_ctx()
skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 );
luffa_2way_init( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
};
@@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input)
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -9,7 +9,7 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
@@ -43,10 +43,10 @@ void init_tt10_4way_ctx()
skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 );
luffa_2way_init( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 );
simd_2way_init( &tt10_4way_ctx.simd, 512 );
};
void timetravel10_4way_hash(void *output, const void *input)
@@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -8,10 +8,10 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"

View File

@@ -5,17 +5,16 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -25,10 +24,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11_4way_ctx_holder;
@@ -42,10 +41,10 @@ void init_x11_4way_ctx()
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
luffa_2way_init( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
simd_2way_init( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
@@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
@@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 7 Luffa parallel 2 way 128 bit
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,

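The split between 4-way and 2-way stages follows word size. Blake, BMW, Skein, JH and Keccak work on 64-bit words, so a 256-bit AVX2 register holds one word from each of four lanes; Luffa and SIMD are built on 128-bit state words, so only two lanes fit per register and the hash is run twice per group of four. The interleave/deinterleave calls at each stage boundary are exactly these layout conversions:

// 4x64 layout:  w0(A) w0(B) w0(C) w0(D)  w1(A) w1(B) ...  (w = 64-bit word)
// 2x128 layout: w0(A) w0(B)  w1(A) w1(B) ...              (w = 128-bit word)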
View File

@@ -10,10 +10,8 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#ifndef NO_AES_NI
@@ -21,9 +19,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -11,15 +11,12 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
typedef struct {
blake512_4way_context blake;
@@ -28,10 +25,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
@@ -45,10 +42,10 @@ void init_x11evo_4way_ctx()
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
simd_2way_init( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
@@ -142,20 +139,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
@@ -202,17 +192,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
typedef struct {
#ifdef NO_AES_NI

View File

@@ -13,10 +13,10 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -27,10 +27,10 @@ typedef struct {
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
@@ -45,10 +45,10 @@ void init_x11gost_4way_ctx()
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
luffa_2way_init( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
simd_2way_init( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
@@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
@@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input )
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
@@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );

View File

@@ -10,9 +10,9 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

algo/x12/x12-4way.c Normal file
View File

@@ -0,0 +1,273 @@
#include "x12-gate.h"
#if defined(X12_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
// sph_fugue512_context fugue;
} x12_4way_ctx_holder;
x12_4way_ctx_holder x12_4way_ctx __attribute__ ((aligned (64)));
void init_x12_4way_ctx()
{
blake512_4way_init( &x12_4way_ctx.blake );
bmw512_4way_init( &x12_4way_ctx.bmw );
init_groestl( &x12_4way_ctx.groestl, 64 );
skein512_4way_init( &x12_4way_ctx.skein );
jh512_4way_init( &x12_4way_ctx.jh );
keccak512_4way_init( &x12_4way_ctx.keccak );
luffa_2way_init( &x12_4way_ctx.luffa, 512 );
cubehashInit( &x12_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x12_4way_ctx.shavite );
simd_2way_init( &x12_4way_ctx.simd, 512 );
init_echo( &x12_4way_ctx.echo, 512 );
hamsi512_4way_init( &x12_4way_ctx.hamsi );
// sph_fugue512_init( &x12_4way_ctx.fugue );
};
void x12_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
/*
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
*/
}
int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x12_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
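The noncep offsets fall out of the 4x64 interleave of the 80-byte header: 64-bit header word w of lane l lands at 64-bit index 4*w + l, i.e. 32-bit index 8*w + 2*l. The nonce is 32-bit header word 19, the high half of 64-bit word 9 on a little-endian machine, hence the extra +1:

// 32-bit index of lane l's nonce inside vdata after
// mm256_interleave_4x64 (derivation of the "9*8 + 1" comment):
// idx(l) = 8*9 + 2*l + 1   ->  73, 75, 77, 79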

algo/x12/x12-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "x12-gate.h"
bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
init_x12_4way_ctx();
gate->scanhash = (void*)&scanhash_x12_4way;
gate->hash = (void*)&x12_4way_hash;
#else
init_x12_ctx();
gate->scanhash = (void*)&scanhash_x12;
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
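The gate keeps algorithm selection out of the core: register_x12_algo fills in the function pointers once, choosing the 4-way implementations when X12_4WAY is defined (see x12-gate.h below) and the scalar ones otherwise. A hypothetical caller, sketched only to show the dispatch and assuming the algo_gate_t fields carry the same signatures as the functions registered above (the real call sites, along with thr_id/work/max_nonce, live in the miner core):

algo_gate_t gate;
register_x12_algo( &gate );            // picks 4-way or scalar path
uint64_t hashes_done = 0;
int found = gate.scanhash( thr_id, &work, max_nonce, &hashes_done );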

algo/x12/x12-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef X12_GATE_H__
#define X12_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X12_4WAY
#endif
bool register_x12_algo( algo_gate_t* gate );
#if defined(X12_4WAY)
void x12_4way_hash( void *state, const void *input );
int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x12_4way_ctx();
#endif
void x12hash( void *state, const void *input );
int scanhash_x12( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x12_ctx();
#endif

algo/x12/x12.c Normal file
View File

@@ -0,0 +1,252 @@
#include "x12-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/groestl/sph_groestl.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
hashState_groestl groestl;
hashState_echo echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
// sph_fugue512_context fugue;
} x12_ctx_holder;
x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#else
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
#endif
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
sph_hamsi512_init( &x12_ctx.hamsi );
// sph_fugue512_init( &x13_ctx.fugue );
};
void x12hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB (hash+64)
x12_ctx_holder ctx;
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
// X11 algos
unsigned char hashbuf[128];
size_t hashptr;
sph_u64 hashctA;
sph_u64 hashctB;
//---blake1---
DECL_BLK;
BLK_I;
BLK_W;
BLK_C;
//---bmw2---
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
//---groestl---
#ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#endif
//---skein4---
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
//---jh5------
DECL_JH;
JH_H;
//---keccak6---
DECL_KEC;
KEC_I;
KEC_U;
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
#endif
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
/*
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
*/
asm volatile ("emms");
memcpy(output, hash, 32);   // hamsi closed into hash (fugue stage disabled)
}
int scanhash_x12(int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need big-endian data...
swab32_array( endiandata, pdata, 20 );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x12hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
if ( fulltest(hash64, ptarget) )
{
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// {
// applog(LOG_INFO, "Result does not validate on CPU!");
// }
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
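The htmax/masks ladder is a cheap pre-filter: the loop picks the smallest htmax entry that is >= Htarg (target word 7), and the paired mask covers every bit that must be zero in hash word 7 for the hash to have any chance of passing. For example, Htarg = 0x3A selects htmax 0xFF and mask 0xFFFFFF00, so a candidate is rejected with a single AND unless the top 24 bits of hash64[7] are clear, and only survivors pay for the full 256-bit fulltest comparison.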

Some files were not shown because too many files have changed in this diff.