Compare commits

...

2 Commits

Author SHA1 Message Date
Jay D Dee
e4265a6f11 v3.8.1.1 2018-02-09 23:30:14 -05:00
Jay D Dee
a28daca3ce v3.8.1 2018-02-07 16:38:45 -05:00
85 changed files with 5426 additions and 3985 deletions

View File

@@ -100,7 +100,8 @@ cpuminer_SOURCES = \
algo/lbry.c \ algo/lbry.c \
algo/luffa/sph_luffa.c \ algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \ algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \ algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \ algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \ algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \ algo/lyra2/lyra2rev2-gate.c \
@@ -127,7 +128,11 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \ algo/quark/anime-gate.c \
algo/quark/anime.c \ algo/quark/anime.c \
algo/quark/anime-4way.c \ algo/quark/anime-4way.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \ algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
algo/qubit/deep-gate.c \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \ algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \ algo/ripemd/sph_ripemd.c \
algo/scrypt.c \ algo/scrypt.c \
@@ -143,8 +148,9 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \ algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \ algo/shavite/shavite.c \
algo/simd/sph_simd.c \ algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \ algo/simd/nist.c \
algo/simd/sse2/vector.c \ algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \ algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \ algo/skein/skein-hash-4way.c \
algo/skein/skein.c \ algo/skein/skein.c \

View File

@@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions.
Supported Algorithms Supported Algorithms
-------------------- --------------------
anime Animecoin
argon2 argon2
axiom Shabal-256 MemoHash axiom Shabal-256 MemoHash
bastion bastion
@@ -78,6 +79,7 @@ Supported Algorithms
x13sm3 hsr (Hshare) x13sm3 hsr (Hshare)
x14 X14 x14 X14
x15 X15 x15 X15
x16r Ravencoin
x17 x17
xevan Bitsend xevan Bitsend
yescrypt Globalboost-Y (BSTY) yescrypt Globalboost-Y (BSTY)
@@ -136,10 +138,13 @@ output from the miner showing the startup and any errors.
Donations Donations
--------- ---------
I do not do this for money but I have a donation address if users cpuminer-opt has no fees of any kind but donations are accepted.
are so inclined.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining! Happy mining!

View File

@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell... cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

View File

@@ -98,8 +98,8 @@ Start mining.
Windows Windows
The following in how the Windows binary releases are built. It's old and Precompiled Windows binaries are built on a Linux host using Mingw
not very good but it works, for me anyway. with a more recent compiler than the following Windows hosted procedure.
Building on Windows prerequisites: Building on Windows prerequisites:
@@ -131,7 +131,7 @@ or similar Windows program.
In msys shell cd to miner directory. In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands. Run build.sh to build on Windows or execute the following commands.
./autogen.sh ./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -159,6 +159,20 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log Change Log
---------- ----------
v3.8.1.1
Fixed Windows AVX2 crash.
v3.8.1
Fixes x16r on CPUs with only SSE2.
More Optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
v3.8.0.1
Fixed x16r AVX2 low hash rate.
v3.8.0 v3.8.0
4way no longer a seperate feature, included in AVX2. 4way no longer a seperate feature, included in AVX2.

View File

@@ -553,22 +553,22 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \ _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \ M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \ M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \ M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \ M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \ M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \ M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \ M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \ M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \ M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \ M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \ M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \ M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \ M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \ M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \ M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \ M[0xF] = mm_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \ for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \ ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \ H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +615,22 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \ M0 = mm_bswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \ M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \ M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \ M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \ M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \ M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \ M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \ M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \ M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \ M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \ MA = mm_bswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \ MB = mm_bswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \ MC = mm_bswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \ MD = mm_bswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \ ME = mm_bswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \ MF = mm_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \ ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \ ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \ ROUND_S_4WAY(2); \
@@ -727,22 +727,22 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_byteswap_32( * buf ); \ M0 = mm256_bswap_32( * buf ); \
M1 = mm256_byteswap_32( *(buf+1) ); \ M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_byteswap_32( *(buf+2) ); \ M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_byteswap_32( *(buf+3) ); \ M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_byteswap_32( *(buf+4) ); \ M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_byteswap_32( *(buf+5) ); \ M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_byteswap_32( *(buf+6) ); \ M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_byteswap_32( *(buf+7) ); \ M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_byteswap_32( *(buf+8) ); \ M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_byteswap_32( *(buf+9) ); \ M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_byteswap_32( *(buf+10) ); \ MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_byteswap_32( *(buf+11) ); \ MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_byteswap_32( *(buf+12) ); \ MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_byteswap_32( *(buf+13) ); \ MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_byteswap_32( *(buf+14) ); \ ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_byteswap_32( *(buf+15) ); \ MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \ ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \ ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \ ROUND_S_8WAY(2); \
@@ -848,22 +848,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \ M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \ M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \ M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \ M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \ M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \ M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \ M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \ M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \ M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \ M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \ M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \ M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \ M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \ M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \ M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \ M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \ for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \ ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \ H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +913,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \ M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \ M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \ M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \ M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \ M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \ M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \ M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \ M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \ M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \ M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \ MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \ MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \ MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \ MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \ ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \ MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \ ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \ ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \ ROUND_B_4WAY(2); \
@@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) ); _mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr ); blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
} }
else else
@@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
memset_zero_128( u.buf, 56>>2 ); memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL ); u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 ); blake32_4way( sc, u.buf, 64 );
} }
out = (__m128i*)dst; out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ ) for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] ); out[k] = mm_bswap_32( sc->H[k] );
} }
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2], u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) ); _mm256_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
} }
else else
@@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( u.buf, 56>>2 ); memset_zero_256( u.buf, 56>>2 );
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf, 64 ); blake32_8way( sc, u.buf, 64 );
} }
out = (__m256i*)dst; out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ ) for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_byteswap_32( sc->H[k] ); out[k] = mm256_bswap_32( sc->H[k] );
} }
// Blake-512 4 way // Blake-512 4 way
@@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc,
if ( out_size_w64 == 8 ) if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)], u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) ); _mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64( *(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) ); _mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64( *(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) ); _mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr ); blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf, 112>>3 ); memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 ) if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64( *(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) ); _mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64( *(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) ); _mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 ); blake64_4way( sc, u.buf, 128 );
} }
out = (__m256i*)dst; out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ ) for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_64( sc->H[k] ); out[k] = mm256_bswap_64( sc->H[k] );
} }
#endif #endif

View File

@@ -51,7 +51,9 @@ extern "C"{
// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash // BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
// while lanes 1 & 3 produce invalid hash. The cause is not known. // while lanes 1 & 3 produce invalid hash. The cause is not known.
// Some things that could cause it are: using epi64 instead of epi32,
// a memory write that is the wrong size, an attempt to index a vector
// like an array (only works for 64 bit elements).
static const sph_u32 IV256[] = { static const sph_u32 IV256[] = {
@@ -984,7 +986,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
} }
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = mm_zero; buf[ (buf_size - 4) >> 2 ] = m128_zero;
compress_small( buf, h, h2 ); compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ ) for ( u = 0; u < 16; u ++ )

View File

@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
#endif #endif
} // transform } // transform
// Ccubehash context initializing is very expensive. // Cubehash context initializing is very expensive.
// Cache the intial value for faster reinitializing. // Cache the intial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64))); cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));

View File

@@ -20,11 +20,11 @@ typedef struct {
#else #else
hashState_groestl groestl; hashState_groestl groestl;
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_CTX sha; // SHA256_CTX sha;
#else //#else
sph_sha256_context sha; sph_sha256_context sha;
#endif //#endif
} myrgr_ctx_holder; } myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx; myrgr_ctx_holder myrgr_ctx;
@@ -36,11 +36,11 @@ void init_myrgr_ctx()
#else #else
init_groestl (&myrgr_ctx.groestl, 64 ); init_groestl (&myrgr_ctx.groestl, 64 );
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_Init( &myrgr_ctx.sha ); // SHA256_Init( &myrgr_ctx.sha );
#else //#else
sph_sha256_init( &myrgr_ctx.sha ); sph_sha256_init( &myrgr_ctx.sha );
#endif //#endif
} }
void myriadhash( void *output, const void *input ) void myriadhash( void *output, const void *input )
@@ -57,13 +57,13 @@ void myriadhash( void *output, const void *input )
(const char*)input, 640 ); (const char*)input, 640 );
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha, hash, 64 ); // SHA256_Update( &ctx.sha, hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx.sha ); // SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else //#else
sph_sha256(&ctx.sha, hash, 64); sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash); sph_sha256_close(&ctx.sha, hash);
#endif //#endif
memcpy(output, hash, 32); memcpy(output, hash, 32);
} }

File diff suppressed because it is too large Load Diff

View File

@@ -48,20 +48,20 @@ extern "C"{
#define SPH_SIZE_hamsi512 512 #define SPH_SIZE_hamsi512 512
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct { typedef struct {
__m128i h[16]; __m256i h[8];
__m128i partial[2]; __m256i buf[1];
size_t partial_len; size_t partial_len;
sph_u32 count_high, count_low; sph_u32 count_high, count_low;
} hamsi_4way_big_context; } hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context; typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init(void *cc); void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way(void *cc, const void *data, size_t len); void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4way_close(void *cc, void *dst);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -1,482 +0,0 @@
/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
/*
* Helper code for Hamsi (input block expansion). This code is
* automatically generated and includes precomputed tables for
* expansion code which handles 2 to 8 bits at a time.
*
* This file is included from hamsi.c, and is not meant to be compiled
* independently.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef __cplusplus
extern "C"{
#endif
/* Note: this table lists bits within each byte from least
siginificant to most significant. */
static const sph_u32 T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
};
/* Message-expansion step for one 32-bit word of the input block.
 * buf[n] is a __m128i holding that word in each of its four 32-bit
 * lanes.  For each of the word's 32 bits a full-width mask dm is built
 * ( all-ones in a lane where the bit is set, all-zeros otherwise ) and
 * the matching 16-word row of the T512 linear-code table is
 * conditionally XORed into the bitsliced state m0..mF.
 * Expects tp ( a cursor into T512 ), buf and m0..mF in scope at the
 * expansion site.
 * Fix: dropped the trailing ';' after "while (0)" — it defeated the
 * do/while(0) idiom by leaving a stray empty statement at every
 * expansion ( would break "if (x) U_BIG(0); else ..." ). */
#define U_BIG( n ) \
do { \
   __m128i db = buf[n]; \
   for ( int u = 0; u < 32; u++ ) \
   { \
      /* dm = per-lane 0xffffffff if low bit of db set, else 0 */ \
      __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
      m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      /* move the next message bit into the low position */ \
      db = _mm_srli_epi32( db, 1 ); \
   } \
} while (0)
#define INPUT_BIG \
do { \
const sph_u32 *tp = &T512[0][0]; \
m0 = mm_zero; \
m1 = mm_zero; \
m2 = mm_zero; \
m3 = mm_zero; \
m4 = mm_zero; \
m5 = mm_zero; \
m6 = mm_zero; \
m7 = mm_zero; \
m8 = mm_zero; \
m9 = mm_zero; \
mA = mm_zero; \
mB = mm_zero; \
mC = mm_zero; \
mD = mm_zero; \
mE = mm_zero; \
mF = mm_zero; \
U_BIG( 0 ); \
U_BIG( 1 ); \
} while (0)
#ifdef __cplusplus
}
#endif

940
algo/hamsi/sph_hamsi.c.test Normal file
View File

@@ -0,0 +1,940 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_hamsi.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_SMALL_FOOTPRINT_HAMSI 1
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
/* Default expansion levels ( input bits per table lookup ) — see the
   size/speed trade-off table in the comment above. */
#if !defined SPH_HAMSI_EXPAND_SMALL
#if SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_HAMSI_EXPAND_SMALL 4
#else
#define SPH_HAMSI_EXPAND_SMALL 8
#endif
#endif
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 8
#endif
/* MSVC: silence "unary minus applied to unsigned" ( used deliberately
   to build bit masks ). */
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/* Message-expansion tables and the INPUT_SMALL / INPUT_BIG macros. */
#include "sph_hamsi_helper.c"
/* Hamsi-224 initial chaining value ( 8 x 32 bits ). */
static const sph_u32 IV224[] = {
SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
/*
 * This version is the one used in the Hamsi submission package for
 * round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
 * shall soon be corrected in the official Hamsi specification.
 *
static const sph_u32 IV224[] = {
SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
*/
/* Hamsi-256 initial chaining value ( 8 x 32 bits ). */
static const sph_u32 IV256[] = {
SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
SPH_C32(0x656d656e), SPH_C32(0x7420456c)
};
/* Hamsi-384 initial chaining value ( 16 x 32 bits ). */
static const sph_u32 IV384[] = {
SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
SPH_C32(0x2c204b61)
};
/* Hamsi-512 initial chaining value ( 16 x 32 bits ). */
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
/* Round constants for the non-final rounds ( consumed by P_SMALL /
   P_BIG through ROUND_* ). */
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
/* Round constants for the final rounds ( consumed by PF_SMALL /
   PF_BIG through ROUND_* ). */
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
/* Hamsi-224/256 chaining state: eight 32-bit words held in locals. */
#define DECL_STATE_SMALL \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
/* Load the chaining state from the context into the c* locals. */
#define READ_STATE_SMALL(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
/* Store the c* locals back into the context. */
#define WRITE_STATE_SMALL(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
/* Hamsi-256 state layout: the 16 logical words s0..sF interleave the
   eight expanded message words ( m0..m7 ) with the eight chaining
   words ( c0..c7 ). */
#define s0 m0
#define s1 m1
#define s2 c0
#define s3 c1
#define s4 c2
#define s5 c3
#define s6 m2
#define s7 m3
#define s8 m4
#define s9 m5
#define sA c4
#define sB c5
#define sC c6
#define sD c7
#define sE m6
#define sF m7
#define SBOX(a, b, c, d) do { \
sph_u32 t; \
t = (a); \
(a) &= (c); \
(a) ^= (d); \
(c) ^= (b); \
(c) ^= (a); \
(d) |= t; \
(d) ^= (b); \
t ^= (c); \
(b) = (d); \
(d) |= t; \
(d) ^= (a); \
(a) &= (b); \
t ^= (a); \
(b) ^= (d); \
(b) ^= t; \
(a) = (c); \
(c) = (b); \
(b) = (d); \
(d) = SPH_T32(~t); \
} while (0)
/* Linear diffusion layer: mixes four state words with rotations,
   shifts and XORs ( in place ). */
#define L(a, b, c, d) do { \
(a) = SPH_ROTL32(a, 13); \
(c) = SPH_ROTL32(c, 3); \
(b) ^= (a) ^ (c); \
(d) ^= (c) ^ SPH_T32((a) << 3); \
(b) = SPH_ROTL32(b, 1); \
(d) = SPH_ROTL32(d, 7); \
(a) ^= (b) ^ (d); \
(c) ^= (d) ^ SPH_T32((b) << 7); \
(a) = SPH_ROTL32(a, 5); \
(c) = SPH_ROTL32(c, 22); \
} while (0)
/* One round of the Hamsi-256 permutation: XOR round constants into the
   state ( the round counter rc is folded into word s1 ), apply the
   bitsliced S-box to the four columns, then the linear layer L to the
   four diagonals. */
#define ROUND_SMALL(rc, alpha) do { \
s0 ^= alpha[0x00]; \
s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
s2 ^= alpha[0x02]; \
s3 ^= alpha[0x03]; \
s4 ^= alpha[0x08]; \
s5 ^= alpha[0x09]; \
s6 ^= alpha[0x0A]; \
s7 ^= alpha[0x0B]; \
s8 ^= alpha[0x10]; \
s9 ^= alpha[0x11]; \
sA ^= alpha[0x12]; \
sB ^= alpha[0x13]; \
sC ^= alpha[0x18]; \
sD ^= alpha[0x19]; \
sE ^= alpha[0x1A]; \
sF ^= alpha[0x1B]; \
SBOX(s0, s4, s8, sC); \
SBOX(s1, s5, s9, sD); \
SBOX(s2, s6, sA, sE); \
SBOX(s3, s7, sB, sF); \
L(s0, s5, sA, sF); \
L(s1, s6, sB, sC); \
L(s2, s7, s8, sD); \
L(s3, s4, s9, sE); \
} while (0)
/* Non-final permutation: three rounds with the alpha_n constants. */
#define P_SMALL do { \
ROUND_SMALL(0, alpha_n); \
ROUND_SMALL(1, alpha_n); \
ROUND_SMALL(2, alpha_n); \
} while (0)
/* Final permutation: six rounds with the alpha_f constants. */
#define PF_SMALL do { \
ROUND_SMALL(0, alpha_f); \
ROUND_SMALL(1, alpha_f); \
ROUND_SMALL(2, alpha_f); \
ROUND_SMALL(3, alpha_f); \
ROUND_SMALL(4, alpha_f); \
ROUND_SMALL(5, alpha_f); \
} while (0)
/* Feed-forward / truncation: XOR eight of the sixteen state words back
   into the context chaining value.  The descending order matters
   because the s* names alias the c* locals being reassigned. */
#define T_SMALL do { \
/* order is important */ \
c7 = (sc->h[7] ^= sB); \
c6 = (sc->h[6] ^= sA); \
c5 = (sc->h[5] ^= s9); \
c4 = (sc->h[4] ^= s8); \
c3 = (sc->h[3] ^= s3); \
c2 = (sc->h[2] ^= s2); \
c1 = (sc->h[1] ^= s1); \
c0 = (sc->h[0] ^= s0); \
} while (0)
/* Absorb num consecutive 4-byte blocks from buf into the Hamsi-224/256
   chaining state, advancing the processed-bit counter by num * 32.
   The m0..m7 names are required by the INPUT_SMALL macro. */
static void
hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_SMALL
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 5;
#else
tmp = SPH_T32((sph_u32)num << 5);
sc->count_low = SPH_T32(sc->count_low + tmp);
/* high word of the 64-bit bit count: ( num * 32 ) >> 32 == num >> 27 */
sc->count_high += (sph_u32)((num >> 13) >> 14);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_SMALL(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
/* expand block, permute, feed forward */
INPUT_SMALL;
P_SMALL;
T_SMALL;
buf += 4;
}
WRITE_STATE_SMALL(sc);
}
/* Absorb the last ( padded ) 4-byte block using the six-round final
   permutation.  The bit counter is not advanced here. */
static void
hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_SMALL
READ_STATE_SMALL(sc);
INPUT_SMALL;
PF_SMALL;
T_SMALL;
WRITE_STATE_SMALL(sc);
}
/* Reset the context: load the chaining value from iv and clear the
   partial-block buffer and the bit counter. */
static void
hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
/*
 * Buffered absorb for Hamsi-224/256: feed len bytes of data into the
 * state.  Bytes that do not fill a whole 4-byte block are kept in
 * sc->partial until the next call ( or the final close ).
 */
static void
hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
{
	const unsigned char *p = data;

	if (sc->partial_len != 0) {
		/* Top up the pending partial block first. */
		size_t need = 4 - sc->partial_len;

		if (len < need) {
			memcpy(sc->partial + sc->partial_len, p, len);
			sc->partial_len += len;
			return;
		}
		memcpy(sc->partial + sc->partial_len, p, need);
		p += need;
		len -= need;
		hamsi_small(sc, sc->partial, 1);
		sc->partial_len = 0;
	}
	/* Bulk-process whole 4-byte blocks, then stash the tail bytes. */
	hamsi_small(sc, p, len >> 2);
	p += len & ~(size_t)3;
	len &= (size_t)3;
	memcpy(sc->partial, p, len);
	sc->partial_len = len;
}
/* Finalize: append the padding ( a 1 bit after the n valid bits of ub,
   then zeroes ) and the big-endian 64-bit message length in bits, push
   pad[0..7] through the normal rounds and pad[8..11] through the final
   permutation, then write out_size_w32 big-endian state words to dst.
   ub/n: last partial byte and its valid bit count ( 0..7 ). */
static void
hamsi_small_close(sph_hamsi_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[12];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
memcpy(pad, sc->partial, ptr);
/* pad[4..11]: total message length in bits, written before the
   padding byte so the count below is not affected by the extra
   blocks processed next */
#if SPH_64
sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad + 4, sc->count_high);
sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
#endif
/* set the first padding bit just after the n message bits of ub */
z = 0x80 >> n;
pad[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 4)
pad[ptr ++] = 0;
hamsi_small(sc, pad, 2);
hamsi_small_final(sc, pad + 8);
out = dst;
for (u = 0; u < out_size_w32; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
/* Hamsi-384/512 chaining state: sixteen 32-bit words held in locals. */
#define DECL_STATE_BIG \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
/* Load the chaining state from the context into the c* locals. */
#define READ_STATE_BIG(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c8 = sc->h[0x8]; \
c9 = sc->h[0x9]; \
cA = sc->h[0xA]; \
cB = sc->h[0xB]; \
cC = sc->h[0xC]; \
cD = sc->h[0xD]; \
cE = sc->h[0xE]; \
cF = sc->h[0xF]; \
} while (0)
/* Store the c* locals back into the context. */
#define WRITE_STATE_BIG(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0x8] = c8; \
sc->h[0x9] = c9; \
sc->h[0xA] = cA; \
sc->h[0xB] = cB; \
sc->h[0xC] = cC; \
sc->h[0xD] = cD; \
sc->h[0xE] = cE; \
sc->h[0xF] = cF; \
} while (0)
/* Hamsi-512 state layout: the 32 logical words s00..s1F interleave the
   sixteen expanded message words ( m0..mF ) with the sixteen chaining
   words ( c0..cF ). */
#define s00 m0
#define s01 m1
#define s02 c0
#define s03 c1
#define s04 m2
#define s05 m3
#define s06 c2
#define s07 c3
#define s08 c4
#define s09 c5
#define s0A m4
#define s0B m5
#define s0C c6
#define s0D c7
#define s0E m6
#define s0F m7
#define s10 m8
#define s11 m9
#define s12 c8
#define s13 c9
#define s14 mA
#define s15 mB
#define s16 cA
#define s17 cB
#define s18 cC
#define s19 cD
#define s1A mC
#define s1B mD
#define s1C cE
#define s1D cF
#define s1E mE
#define s1F mF
/* One round of the Hamsi-512 permutation: XOR the 32 alpha round
 * constants into the state ( the round counter rc is folded into word
 * s01 ), apply the bitsliced S-box to the eight columns, then the
 * linear layer L to the eight diagonals plus four extra word groups.
 * Cleanup: removed commented-out debug printf blocks that were left
 * inside the macro. */
#define ROUND_BIG(rc, alpha) do { \
s00 ^= alpha[0x00]; \
s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
s02 ^= alpha[0x02]; \
s03 ^= alpha[0x03]; \
s04 ^= alpha[0x04]; \
s05 ^= alpha[0x05]; \
s06 ^= alpha[0x06]; \
s07 ^= alpha[0x07]; \
s08 ^= alpha[0x08]; \
s09 ^= alpha[0x09]; \
s0A ^= alpha[0x0A]; \
s0B ^= alpha[0x0B]; \
s0C ^= alpha[0x0C]; \
s0D ^= alpha[0x0D]; \
s0E ^= alpha[0x0E]; \
s0F ^= alpha[0x0F]; \
s10 ^= alpha[0x10]; \
s11 ^= alpha[0x11]; \
s12 ^= alpha[0x12]; \
s13 ^= alpha[0x13]; \
s14 ^= alpha[0x14]; \
s15 ^= alpha[0x15]; \
s16 ^= alpha[0x16]; \
s17 ^= alpha[0x17]; \
s18 ^= alpha[0x18]; \
s19 ^= alpha[0x19]; \
s1A ^= alpha[0x1A]; \
s1B ^= alpha[0x1B]; \
s1C ^= alpha[0x1C]; \
s1D ^= alpha[0x1D]; \
s1E ^= alpha[0x1E]; \
s1F ^= alpha[0x1F]; \
SBOX(s00, s08, s10, s18); \
SBOX(s01, s09, s11, s19); \
SBOX(s02, s0A, s12, s1A); \
SBOX(s03, s0B, s13, s1B); \
SBOX(s04, s0C, s14, s1C); \
SBOX(s05, s0D, s15, s1D); \
SBOX(s06, s0E, s16, s1E); \
SBOX(s07, s0F, s17, s1F); \
L(s00, s09, s12, s1B); \
L(s01, s0A, s13, s1C); \
L(s02, s0B, s14, s1D); \
L(s03, s0C, s15, s1E); \
L(s04, s0D, s16, s1F); \
L(s05, s0E, s17, s18); \
L(s06, s0F, s10, s19); \
L(s07, s08, s11, s1A); \
L(s00, s02, s05, s07); \
L(s10, s13, s15, s16); \
L(s09, s0B, s0C, s0E); \
L(s19, s1A, s1C, s1F); \
} while (0)
#if SPH_SMALL_FOOTPRINT_HAMSI

/* Non-final permutation: six rounds with the alpha_n constants. */
#define P_BIG do { \
unsigned r; \
for (r = 0; r < 6; r ++) \
ROUND_BIG(r, alpha_n); \
} while (0)

/* Final permutation: twelve rounds with the alpha_f constants. */
#define PF_BIG do { \
unsigned r; \
for (r = 0; r < 12; r ++) \
ROUND_BIG(r, alpha_f); \
} while (0)

#else

/* Non-final permutation: six unrolled rounds with alpha_n.
 * Cleanup: removed a commented-out debug printf block that sat between
 * rounds 0 and 1. */
#define P_BIG do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)

/* Final permutation: twelve unrolled rounds with alpha_f. */
#define PF_BIG do { \
ROUND_BIG(0, alpha_f); \
ROUND_BIG(1, alpha_f); \
ROUND_BIG(2, alpha_f); \
ROUND_BIG(3, alpha_f); \
ROUND_BIG(4, alpha_f); \
ROUND_BIG(5, alpha_f); \
ROUND_BIG(6, alpha_f); \
ROUND_BIG(7, alpha_f); \
ROUND_BIG(8, alpha_f); \
ROUND_BIG(9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)

#endif
/* Feed-forward / truncation for Hamsi-512: XOR sixteen of the 32 state
   words ( s00..s07 and s10..s17 ) back into the context chaining
   value.  The descending order matters because the s* names alias the
   c* locals being reassigned. */
#define T_BIG do { \
/* order is important */ \
cF = (sc->h[0xF] ^= s17); \
cE = (sc->h[0xE] ^= s16); \
cD = (sc->h[0xD] ^= s15); \
cC = (sc->h[0xC] ^= s14); \
cB = (sc->h[0xB] ^= s13); \
cA = (sc->h[0xA] ^= s12); \
c9 = (sc->h[0x9] ^= s11); \
c8 = (sc->h[0x8] ^= s10); \
c7 = (sc->h[0x7] ^= s07); \
c6 = (sc->h[0x6] ^= s06); \
c5 = (sc->h[0x5] ^= s05); \
c4 = (sc->h[0x4] ^= s04); \
c3 = (sc->h[0x3] ^= s03); \
c2 = (sc->h[0x2] ^= s02); \
c1 = (sc->h[0x1] ^= s01); \
c0 = (sc->h[0x0] ^= s00); \
} while (0)
/* Absorb num consecutive 8-byte blocks from buf into the Hamsi-384/512
 * chaining state, advancing the processed-bit counter by num * 64.
 * The m0..mF names are required by the INPUT_BIG macro.
 * Cleanup: removed three commented-out debug printf blocks from the
 * absorb loop. */
static void
hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_BIG
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 6;
#else
tmp = SPH_T32((sph_u32)num << 6);
sc->count_low = SPH_T32(sc->count_low + tmp);
/* high word of the 64-bit bit count: ( num * 64 ) >> 32 == num >> 26 */
sc->count_high += (sph_u32)((num >> 13) >> 13);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_BIG(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
/* expand block, permute, feed forward */
INPUT_BIG;
P_BIG;
T_BIG;
buf += 8;
}
WRITE_STATE_BIG(sc);
}
/*
 * Final compression for the big state: process one last 8-byte block
 * (the encoded bit-length padding) using the 12-round permutation
 * PF_BIG instead of the 6-round P_BIG.  Note the bit counter is
 * deliberately NOT advanced here.
 */
static void
hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
{
	sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
	sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
	DECL_STATE_BIG
	READ_STATE_BIG(sc);
	INPUT_BIG;
	PF_BIG;
	T_BIG;
	WRITE_STATE_BIG(sc);
}
/*
 * Reset a big-Hamsi (384/512) context: load the caller-supplied IV
 * into the chaining value and clear both the partial-block buffer
 * length and the message bit counter.
 */
static void
hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
{
	memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
	sc->count = 0;
#else
	sc->count_low = 0;
	sc->count_high = 0;
#endif
	sc->partial_len = 0;
}
/*
 * Buffered update for the big (384/512) variants.  Input is consumed
 * in 8-byte blocks; up to 7 trailing bytes are stashed in sc->partial
 * until more data arrives or the hash is closed.
 *
 * Fix: removed the unused debug locals `d` and `h` (they were only
 * referenced from commented-out printf blocks and triggered unused-
 * variable warnings) along with the dead debug comments.
 */
static void
hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
{
	/* Complete a previously buffered partial block first. */
	if (sc->partial_len != 0) {
		size_t mlen;

		mlen = 8 - sc->partial_len;
		if (len < mlen) {
			/* Still not a full block: just accumulate. */
			memcpy(sc->partial + sc->partial_len, data, len);
			sc->partial_len += len;
			return;
		} else {
			memcpy(sc->partial + sc->partial_len, data, mlen);
			len -= mlen;
			data = (const unsigned char *)data + mlen;
			hamsi_big(sc, sc->partial, 1);
			sc->partial_len = 0;
		}
	}

	hamsi_big(sc, data, (len >> 3));
	/* Buffer any trailing bytes (< 8) for the next call/close. */
	data = (const unsigned char *)data + (len & ~(size_t)7);
	len &= (size_t)7;
	memcpy(sc->partial, data, len);
	sc->partial_len = len;
}
/*
 * Finalize a big-Hamsi hash: append n (< 8) extra bits from ub, the
 * 0x80-style padding bit and the 64-bit message bit length, run the
 * final 12-round permutation, then emit the digest big-endian.
 *
 * out_size_w32 is 12 for Hamsi-384 (which, per the specification,
 * outputs only words h[0,1,3,4,5,6,8,9,10,12,13,15]) or 16 for
 * Hamsi-512.  (Dead debug comments removed; logic unchanged.)
 */
static void
hamsi_big_close(sph_hamsi_big_context *sc,
	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
	unsigned char pad[8];
	size_t ptr, u;
	unsigned z;
	unsigned char *out;

	ptr = sc->partial_len;
	/* Encode the total bit length (including the extra n bits). */
#if SPH_64
	sph_enc64be(pad, sc->count + (ptr << 3) + n);
#else
	sph_enc32be(pad, sc->count_high);
	sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
#endif
	/* Place the single 1 padding bit just after the last data bit,
	 * keeping the ub bits above it. */
	z = 0x80 >> n;
	sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;
	while (ptr < 8)
		sc->partial[ptr ++] = 0;
	hamsi_big(sc, sc->partial, 1);
	hamsi_big_final(sc, pad);
	out = dst;
	if (out_size_w32 == 12) {
		sph_enc32be(out +  0, sc->h[ 0]);
		sph_enc32be(out +  4, sc->h[ 1]);
		sph_enc32be(out +  8, sc->h[ 3]);
		sph_enc32be(out + 12, sc->h[ 4]);
		sph_enc32be(out + 16, sc->h[ 5]);
		sph_enc32be(out + 20, sc->h[ 6]);
		sph_enc32be(out + 24, sc->h[ 8]);
		sph_enc32be(out + 28, sc->h[ 9]);
		sph_enc32be(out + 32, sc->h[10]);
		sph_enc32be(out + 36, sc->h[12]);
		sph_enc32be(out + 40, sc->h[13]);
		sph_enc32be(out + 44, sc->h[15]);
	} else {
		for (u = 0; u < 16; u ++)
			sph_enc32be(out + (u << 2), sc->h[u]);
	}
}
/* see sph_hamsi.h -- Hamsi-224 public API: init with the 224-bit IV. */
void
sph_hamsi224_init(void *cc)
{
	hamsi_small_init(cc, IV224);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi224(void *cc, const void *data, size_t len)
{
	hamsi_small_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 7 output words (224 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi224_close(void *cc, void *dst)
{
	hamsi_small_close(cc, 0, 0, dst, 7);
// hamsi_small_init(cc, IV224);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_small_close(cc, ub, n, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h -- Hamsi-256 public API: init with the 256-bit IV. */
void
sph_hamsi256_init(void *cc)
{
	hamsi_small_init(cc, IV256);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi256(void *cc, const void *data, size_t len)
{
	hamsi_small_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 8 output words (256 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi256_close(void *cc, void *dst)
{
	hamsi_small_close(cc, 0, 0, dst, 8);
// hamsi_small_init(cc, IV256);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_small_close(cc, ub, n, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h -- Hamsi-384 public API: init with the 384-bit IV. */
void
sph_hamsi384_init(void *cc)
{
	hamsi_big_init(cc, IV384);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi384(void *cc, const void *data, size_t len)
{
	hamsi_big_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 12 output words (384 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi384_close(void *cc, void *dst)
{
	hamsi_big_close(cc, 0, 0, dst, 12);
// hamsi_big_init(cc, IV384);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_big_close(cc, ub, n, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h -- Hamsi-512 public API: init with the 512-bit IV. */
void
sph_hamsi512_init(void *cc)
{
	hamsi_big_init(cc, IV512);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi512(void *cc, const void *data, size_t len)
{
	hamsi_big_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 16 output words (512 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi512_close(void *cc, void *dst)
{
	hamsi_big_close(cc, 0, 0, dst, 16);
// hamsi_big_init(cc, IV512);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_big_close(cc, ub, n, dst, 16);
// hamsi_big_init(cc, IV512);
}
#ifdef __cplusplus
}
#endif

View File

@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
current = (unsigned)sc->count_low & 127UL; current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = mm_one_32; sc->buf[ current>>2 ] = m128_one_32;
current += 4; current += 4;
RSTATE; RSTATE;
if ( current > 116UL ) if ( current > 116UL )

View File

@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI #ifndef NO_AES_NI

View File

@@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
pthread_barrier_wait( &hodl_barrier ); pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done ); return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif #endif
return false;
} }
bool register_hodl_algo( algo_gate_t* gate ) bool register_hodl_algo( algo_gate_t* gate )

View File

@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
for ( int round = 0; round < 3; round++ ) for ( int round = 0; round < 3; round++ )
{ {
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero ); vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 ); init_groestl( &ctx_groestl, 64 );

View File

@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n)) #define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64 #define XOR64_IOTA XOR64
@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
kc->w[i] = _mm256_setzero_si256(); kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement". // Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1; kc->w[ 1] = m256_neg1;
kc->w[ 2] = mm256_neg1; kc->w[ 2] = m256_neg1;
kc->w[ 8] = mm256_neg1; kc->w[ 8] = m256_neg1;
kc->w[12] = mm256_neg1; kc->w[12] = m256_neg1;
kc->w[17] = mm256_neg1; kc->w[17] = m256_neg1;
kc->w[20] = mm256_neg1; kc->w[20] = m256_neg1;
kc->ptr = 0; kc->ptr = 0;
kc->lim = 200 - (out_size >> 2); kc->lim = 200 - (out_size >> 2);
} }

View File

@@ -0,0 +1,584 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "avxdefs.h"
/* MASK selects the low 32-bit word of each 128-bit lane. */
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )

/* XOR the round constants (c0,c1) into the state pair (a,b).
 * NOTE(review): this macro ends with a trailing '\'; the blank line
 * below is what actually terminates it -- do not remove that line. */
#define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
    b = _mm256_xor_si256(b,c1);\

/* MULT2: update the chaining pair (a0,a1) with Luffa's word-ring
 * multiplication -- a 4-byte lane rotate combined with a conditional
 * XOR of the masked low word (presumably "multiply by 2" in the
 * Luffa ring, per the name -- confirm against the Luffa spec). */
#define MULT2(a0,a1) \
do { \
  __m256i b = _mm256_xor_si256( a0, \
        _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
/* STEP_PART: one round of the wide permutation over the 8-vector state
 * x[0..7] with round-constant pair c and scratch t.  Pointer arithmetic
 * on x/c/t is equivalent to array indexing (x[0], x[1], ...). */
#define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
    MIXWORD(*x,*(x+4),*t,*(t+1));\
    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));

/* SUBCRUMB: Luffa's 4-bit S-box applied bit-sliced across the four
 * vectors a0..a3, using t as scratch.  Statement order is significant.
 * NOTE(review): ends with a trailing '\'; the blank line below
 * terminates the macro -- do not remove it. */
#define SUBCRUMB(a0,a1,a2,a3,t)\
    t  = _mm256_load_si256(&a0);\
    a0 = _mm256_or_si256(a0,a1);\
    a2 = _mm256_xor_si256(a2,a3);\
    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
    a0 = _mm256_xor_si256(a0,a3);\
    a3 = _mm256_and_si256(a3,t);\
    a1 = _mm256_xor_si256(a1,a3);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a0);\
    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
    a2 = _mm256_xor_si256(a2,a1);\
    a1 = _mm256_or_si256(a1,a3);\
    t  = _mm256_xor_si256(t,a1);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a1);\
    a1 = _mm256_xor_si256(a1,a0);\
    a0 = _mm256_load_si256(&t);\

/* MIXWORD: the linear mixing layer between word a and word b, built
 * from XORs and fixed 32-bit rotations (2, 14, 10, 1 bits) expressed
 * as shift-pairs; t1/t2 are scratch. */
#define MIXWORD(a,b,t1,t2)\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,2);\
    t2 = _mm256_srli_epi32(a,30);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,14);\
    t2 = _mm256_srli_epi32(b,18);\
    b  = _mm256_or_si256(t1,t2);\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,10);\
    t2 = _mm256_srli_epi32(a,22);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,1);\
    t2 = _mm256_srli_epi32(b,31);\
    b  = _mm256_or_si256(t1,t2);
/* STEP_PART2: one round over the fifth (last) 256-bit chaining pair
 * (a0,a1): shuffle into S-box order, apply SUBCRUMB and MIXWORD, then
 * add the round constants (c0,c1).  t0/t1 and tmp0/tmp1 are scratch. */
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
    a1 = _mm256_shuffle_epi32(a1,147);\
    t0 = _mm256_load_si256(&a1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    t0 = _mm256_unpackhi_epi32(t0,a0);\
    t1 = _mm256_shuffle_epi32(t0,78);\
    a0 = _mm256_shuffle_epi32(a1,78);\
    SUBCRUMB(t1,t0,a0,a1,tmp0);\
    t0 = _mm256_unpacklo_epi32(t0,t1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    a0 = _mm256_load_si256(&a1);\
    a0 = _mm256_unpackhi_epi64(a0,t0);\
    a1 = _mm256_unpacklo_epi64(a1,t0);\
    a1 = _mm256_shuffle_epi32(a1,57);\
    MIXWORD(a0,a1,tmp0,tmp1);\
    ADD_CONSTANT(a0,a1,c0,c1);
/* NMLTOM768 / MIXTON768: transpose between "normal" chaining-value
 * layout and the bit-sliced S-box ("m") layout for a 768-bit state,
 * and back.  Built entirely from unpack/shuffle moves; no arithmetic.
 * NOTE(review): both end with a trailing '\' -- the blank line after
 * each is what terminates the macro; do not remove those lines. */
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
    s2 = _mm256_load_si256(&r1);\
    q2 = _mm256_load_si256(&p1);\
    r2 = _mm256_shuffle_epi32(r2,216);\
    p2 = _mm256_shuffle_epi32(p2,216);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    s2 = _mm256_unpackhi_epi32(s2,r0);\
    q2 = _mm256_unpackhi_epi32(q2,p0);\
    s0 = _mm256_load_si256(&r2);\
    q0 = _mm256_load_si256(&p2);\
    r2 = _mm256_unpacklo_epi64(r2,r1);\
    p2 = _mm256_unpacklo_epi64(p2,p1);\
    s1 = _mm256_load_si256(&s0);\
    q1 = _mm256_load_si256(&q0);\
    s0 = _mm256_unpackhi_epi64(s0,r1);\
    q0 = _mm256_unpackhi_epi64(q0,p1);\
    r2 = _mm256_shuffle_epi32(r2,225);\
    p2 = _mm256_shuffle_epi32(p2,225);\
    r0 = _mm256_load_si256(&s1);\
    p0 = _mm256_load_si256(&q1);\
    s0 = _mm256_shuffle_epi32(s0,225);\
    q0 = _mm256_shuffle_epi32(q0,225);\
    s1 = _mm256_unpacklo_epi64(s1,s2);\
    q1 = _mm256_unpacklo_epi64(q1,q2);\
    r0 = _mm256_unpackhi_epi64(r0,s2);\
    p0 = _mm256_unpackhi_epi64(p0,q2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s3 = _mm256_load_si256(&r2);\
    q3 = _mm256_load_si256(&p2);\

#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
    s0 = _mm256_load_si256(&r0);\
    q0 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r2);\
    q1 = _mm256_load_si256(&p2);\
    r0 = _mm256_unpackhi_epi32(r0,r1);\
    p0 = _mm256_unpackhi_epi32(p0,p1);\
    r2 = _mm256_unpackhi_epi32(r2,r3);\
    p2 = _mm256_unpackhi_epi32(p2,p3);\
    s0 = _mm256_unpacklo_epi32(s0,r1);\
    q0 = _mm256_unpacklo_epi32(q0,p1);\
    s1 = _mm256_unpacklo_epi32(s1,r3);\
    q1 = _mm256_unpacklo_epi32(q1,p3);\
    r1 = _mm256_load_si256(&r0);\
    p1 = _mm256_load_si256(&p0);\
    r0 = _mm256_unpackhi_epi64(r0,r2);\
    p0 = _mm256_unpackhi_epi64(p0,p2);\
    s0 = _mm256_unpackhi_epi64(s0,s1);\
    q0 = _mm256_unpackhi_epi64(q0,q1);\
    r1 = _mm256_unpacklo_epi64(r1,r2);\
    p1 = _mm256_unpacklo_epi64(p1,p2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r1);\
    q1 = _mm256_load_si256(&p1);\

/* NMLTOM1024 / MIXTON1024: the 1024-bit (8-vector) transpose used by
 * rnd512_2way around the STEP_PART rounds.  MIXTON1024 is its own
 * inverse and simply reuses NMLTOM1024. */
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    s1 = _mm256_load_si256(&r3);\
    q1 = _mm256_load_si256(&p3);\
    s3 = _mm256_load_si256(&r3);\
    q3 = _mm256_load_si256(&p3);\
    s1 = _mm256_unpackhi_epi32(s1,r2);\
    q1 = _mm256_unpackhi_epi32(q1,p2);\
    s3 = _mm256_unpacklo_epi32(s3,r2);\
    q3 = _mm256_unpacklo_epi32(q3,p2);\
    s0 = _mm256_load_si256(&s1);\
    q0 = _mm256_load_si256(&q1);\
    s2 = _mm256_load_si256(&s3);\
    q2 = _mm256_load_si256(&q3);\
    r3 = _mm256_load_si256(&r1);\
    p3 = _mm256_load_si256(&p1);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    r3 = _mm256_unpackhi_epi32(r3,r0);\
    p3 = _mm256_unpackhi_epi32(p3,p0);\
    s0 = _mm256_unpackhi_epi64(s0,r3);\
    q0 = _mm256_unpackhi_epi64(q0,p3);\
    s1 = _mm256_unpacklo_epi64(s1,r3);\
    q1 = _mm256_unpacklo_epi64(q1,p3);\
    s2 = _mm256_unpackhi_epi64(s2,r1);\
    q2 = _mm256_unpackhi_epi64(q2,p1);\
    s3 = _mm256_unpacklo_epi64(s3,r1);\
    q3 = _mm256_unpacklo_epi64(q3,p1);

#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* Initial values of the chaining variables (10 vectors of 4 words each,
 * duplicated per lane by luffa_2way_init). */
static const uint32 IV[40] __attribute((aligned(32))) = {
    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};

/* Round constants: 16 pairs for the wide permutation (STEP_PART),
 * followed by 8 pairs for the last chaining pair (STEP_PART2). */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
    0x00000000,0x00000000,0x00000000,0x5090d577,
    0x00000000,0x00000000,0x00000000,0xac11d7fa,
    0x00000000,0x00000000,0x00000000,0x2d1925ab,
    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
    0x00000000,0x00000000,0x00000000,0xb46496ac,
    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
    0x00000000,0x00000000,0x00000000,0xd1925ab0,
    0x00000000,0x00000000,0x00000000,0x78602649,
    0x00000000,0x00000000,0x00000000,0x29131ab6,
    0x00000000,0x00000000,0x00000000,0x8edae952,
    0x00000000,0x00000000,0x00000000,0x0fc053c3,
    0x00000000,0x00000000,0x00000000,0x3b6ba548,
    0x00000000,0x00000000,0x00000000,0x3f014f0c,
    0x00000000,0x00000000,0x00000000,0xedae9520,
    0x00000000,0x00000000,0x00000000,0xfc053c31
};

/* Vectorized round constants; filled from CNS_INIT by luffa_2way_init.
 * NOTE(review): this is a non-static file-scope global shared by all
 * contexts/threads -- every init writes the same values, but confirm
 * no writer races a concurrent reader at first use. */
__m256i CNS[32];
/***************************************************/
/* Round function */
/* state: hash context */
/* One full Luffa-512 round function over two interleaved lanes:
 * message injection (tweaked feed-forward with MULT2), per-word
 * rotations, transpose to bit-sliced layout, 8 rounds of the wide
 * permutation, transpose back, then 8 rounds on the last chaining
 * pair.  msg is two __m256i (32 bytes per lane), already byteswapped. */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
 __m256i t[2];
 __m256i *chainv = state->chainv;
 __m256i msg0, msg1;
 __m256i tmp[2];
 __m256i x[8];

 /* t = XOR of all five chaining pairs, then multiplied by 2. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );
 MULT2( t[0], t[1] );

 /* Reverse word order of the message block (shuffle 27 = 0,1,2,3). */
 msg0 = _mm256_shuffle_epi32( msg[0], 27 );
 msg1 = _mm256_shuffle_epi32( msg[1], 27 );

 /* Inject t into every chaining pair. */
 chainv[0] = _mm256_xor_si256( chainv[0], t[0] );
 chainv[1] = _mm256_xor_si256( chainv[1], t[1] );
 chainv[2] = _mm256_xor_si256( chainv[2], t[0] );
 chainv[3] = _mm256_xor_si256( chainv[3], t[1] );
 chainv[4] = _mm256_xor_si256( chainv[4], t[0] );
 chainv[5] = _mm256_xor_si256( chainv[5], t[1] );
 chainv[6] = _mm256_xor_si256( chainv[6], t[0] );
 chainv[7] = _mm256_xor_si256( chainv[7], t[1] );
 chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
 chainv[9] = _mm256_xor_si256( chainv[9], t[1] );

 /* Forward tweak chain: v[i] = MULT2(v[i]) ^ v[i+1]. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 MULT2( chainv[0], chainv[1]);
 chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
 chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
 MULT2( chainv[2], chainv[3]);
 chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
 chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
 MULT2( chainv[4], chainv[5]);
 chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
 chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
 MULT2( chainv[6], chainv[7]);
 chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
 chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
 MULT2( chainv[8], chainv[9]);
 chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
 chainv[9] = _mm256_xor_si256( chainv[9], t[1] );

 /* Backward tweak chain: v[i] = MULT2(v[i]) ^ v[i-1]. */
 t[0] = chainv[8];
 t[1] = chainv[9];
 MULT2( chainv[8], chainv[9]);
 chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
 chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
 MULT2( chainv[6], chainv[7]);
 chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
 chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
 MULT2( chainv[4], chainv[5]);
 chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
 chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
 MULT2( chainv[2], chainv[3] );
 chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
 chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
 MULT2( chainv[0], chainv[1] );

 /* Inject the message, multiplying it by 2 between pairs. */
 chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 );
 chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 );
 MULT2( msg0, msg1);
 chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
 chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
 MULT2( msg0, msg1);
 chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
 chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
 MULT2( msg0, msg1);
 chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
 chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
 MULT2( msg0, msg1);
 chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
 chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
 MULT2( msg0, msg1);

 /* Per-pair 32-bit rotations by 1..4 bits (shift-pairs). */
 chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
                              _mm256_srli_epi32( chainv[3], 31 ) );
 chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
                              _mm256_srli_epi32( chainv[5], 30 ) );
 chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
                              _mm256_srli_epi32( chainv[7], 29 ) );
 chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
                              _mm256_srli_epi32( chainv[9], 28 ) );

 /* Transpose, run 8 wide-permutation rounds, transpose back. */
 NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
             x[0], x[1], x[2], x[3],
             chainv[1],chainv[3],chainv[5],chainv[7],
             x[4], x[5], x[6], x[7] );

 STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
 STEP_PART( &x[0], &CNS[10], &tmp[0] );
 STEP_PART( &x[0], &CNS[12], &tmp[0] );
 STEP_PART( &x[0], &CNS[14], &tmp[0] );

 MIXTON1024( x[0], x[1], x[2], x[3],
             chainv[0], chainv[2], chainv[4],chainv[6],
             x[4], x[5], x[6], x[7],
             chainv[1],chainv[3],chainv[5],chainv[7]);

 /* Process last 256-bit block */
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31],
             tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
/* Finalization: run two blank rounds (zero message) and after each one
 * emit 32 bytes per lane -- the XOR of all five chaining pairs, word-
 * reversed and byte-swapped to big-endian.  b receives 4 x __m256i of
 * lane-interleaved digest (64 bytes per lane total). */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
 uint32 hash[8] __attribute((aligned(64)));
 __m256i* chainv = state->chainv;
 __m256i t[2];
 __m256i zero[2];
 zero[0] = zero[1] = _mm256_setzero_si256();

 /*---- blank round with m=0 ----*/
 rnd512_2way( state, zero );

 /* First half of the digest: XOR of all chaining pairs. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );

 t[0] = _mm256_shuffle_epi32( t[0], 27 );
 t[1] = _mm256_shuffle_epi32( t[1], 27 );

 _mm256_store_si256( (__m256i*)&hash[0], t[0] );
 _mm256_store_si256( (__m256i*)&hash[8], t[1] );

 casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

 /* Second blank round yields the second half of the digest. */
 rnd512_2way( state, zero );

 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );

 t[0] = _mm256_shuffle_epi32( t[0], 27 );
 t[1] = _mm256_shuffle_epi32( t[1], 27 );

 _mm256_store_si256( (__m256i*)&hash[0], t[0] );
 _mm256_store_si256( (__m256i*)&hash[8], t[1] );

 casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
/* Initialize a 2-way context: broadcast each 4-word group of CNS_INIT
 * and IV into both 128-bit lanes of the vectorized constants/state,
 * and clear the input buffer.  Always returns 0.
 * NOTE(review): this (re)writes the shared file-scope CNS[] table on
 * every call -- the values are constant, but confirm first-use ordering
 * if multiple threads init concurrently. */
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
    int i;
    state->hashbitlen = hashbitlen;

    /* Same constant in the high and low lane (set_epi32 is MSW first). */
    for ( i=0; i<32; i++ ) CNS[i] =
       _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
                         CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );

    for ( i=0; i<10; i++ ) state->chainv[i] =
       _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2) ],
                         IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2) ] );

    ((__m256i*)state->buffer)[0] = m256_zero;
    ((__m256i*)state->buffer)[1] = m256_zero;
    return 0;
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
/* Absorb lane-interleaved input; len is the per-lane byte count.
 * Each 32-byte (per lane) block occupies two __m256i of interleaved
 * data.  A trailing 16-byte partial block (the 80-byte header case)
 * is byteswapped and padded into state->buffer so the final transform
 * can run in luffa_2way_close() (keeps midstate reuse working).
 *
 * Fix: the block loop advanced vdata by 2 per iteration AND indexed
 * with i (vdata[i], vdata[i+1]), so every block after the first read
 * the wrong data.  Index from the advancing pointer directly, matching
 * luffa_2way_update_close(). */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len )
{
    __m256i *vdata  = (__m256i*)data;
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];
    int i;
    int blocks = (int)len / 32;
    state->rembytes = (int)len % 32;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    // store in buffer for transform in final for midstate to work
    if ( state->rembytes )
    {
       // remaining data bytes plus the 0x80 padding marker in each lane
       buffer[0] = mm256_bswap_32( vdata[0] );
       buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                    0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
    }
    return 0;
}
/* Finalize: transform the buffered (already padded) partial block, or a
 * constant empty pad block if no data remains, then emit the digest.
 * Always returns 0. */
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];

    // transform pad block
    if ( state->rembytes )
      // not empty, data is in buffer
      rnd512_2way( state, buffer );
    else
    {     // empty pad block, constant data
       msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }

    finalization512_2way( state, (uint32*)hashval );
    /* NOTE(review): 'hashval+128' is arithmetic on void* (GCC
     * extension); also hashbitlen never exceeds 512 in this codebase,
     * so this branch appears to be dead -- confirm before relying. */
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( hashval+128 ) );
    return 0;
}
/* Combined absorb + finalize, optimized for whole 16-byte (per lane)
 * multiples -- the 64- and 80-byte header cases.  inlen is the per-lane
 * byte count; data is lane-interleaved.  Always returns 0. */
int luffa_2way_update_close( luffa_2way_context *state,
                void *output, const void *data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
   const __m256i *vdata = (__m256i*)data;
   __m256i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bswap_32( vdata[ 1 ] );
      rnd512_2way( state, msg );
   }

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = mm256_bswap_32( vdata[0] );
      msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
      rnd512_2way( state, msg );
   }
   else
   {
      // empty pad block
      msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
      msg[1] = m256_zero;
      rnd512_2way( state, msg );
   }

   finalization512_2way( state, (uint32*)output );
   /* NOTE(review): void* arithmetic (GCC extension); branch appears
    * dead since hashbitlen <= 512 here -- confirm. */
   if ( state->hashbitlen > 512 )
      finalization512_2way( state, (uint32*)( output+128 ) );
   return 0;
}
#endif

View File

@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
/* The supported digest lengths, in bits. */
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512

/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
                                                     * of a message block*/

/* The number of 256-bit chaining blocks for each digest width. */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5

/* The limit of the length of message (bytes) per width family. */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
/* 2-way (AVX2) Luffa state: each __m256i carries the same chaining
 * word for two independent hash lanes (one per 128-bit half). */
typedef struct {
    uint32 buffer[8*2] __attribute((aligned(64)));  /* pending padded input block */
    __m256i chainv[10] __attribute((aligned(32)));  /* Chaining values */
    int hashbitlen;  /* digest length in bits (224/256/384/512) */
    int rembytes;    /* per-lane bytes held in buffer from a partial block */
} luffa_2way_context;

int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
/* Absorb data (len is per-lane bytes); buffers one trailing partial block. */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len );
/* Finalize and write the lane-interleaved digest to hashval. */
int luffa_2way_close( luffa_2way_context *state, void *hashval );
/* One-shot absorb + finalize, optimized for 64/80 byte inputs. */
int luffa_2way_update_close( luffa_2way_context *state, void *output,
                             const void *data, size_t inlen );
#endif
#endif

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks // full blocks
for ( i = 0; i < blocks; i++ ) for ( i = 0; i < blocks; i++ )
{ {
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) ); mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN; data += MSG_BLOCK_BYTE_LEN;
} }
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes ) if ( state->rembytes )
{ {
// remaining data bytes // remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) ); casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
// padding of partial block // padding of partial block
casti_m128i( state->buffer, 1 ) = casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks // full blocks
for ( i = 0; i < blocks; i++ ) for ( i = 0; i < blocks; i++ )
{ {
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) ); mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN; data += MSG_BLOCK_BYTE_LEN;
} }
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{ {
// padding of partial block // padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_32( cast_m128i( data ) ) ); mm_bswap_32( cast_m128i( data ) ) );
} }
else else
{ {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
} }
#else #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]); _mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]); _mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
} }
#endif #endif

View File

@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way_close( &ctx.blake, vhash ); blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
blake512_4way_init( &ctx.blake ); blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );

View File

@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
blake512_4way_init( &ctx.blake ); blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );

130
algo/qubit/deep-2way.c Normal file
View File

@@ -0,0 +1,130 @@
#include "deep-gate.h"
#if defined(DEEP_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
// Context bundle for the 2-way Deep hash: one prototype instance
// (deep_2way_ctx) is initialised once at startup and copied into a
// stack-local working copy for every hash computed.
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_echo echo;
} deep_2way_ctx_holder;
// Global prototype contexts. The luffa member also carries the per-work
// midstate written by scanhash_deep_2way before the nonce loop.
deep_2way_ctx_holder deep_2way_ctx;
// One-time initialisation of the prototype contexts (512-bit digests).
void init_deep_2way_ctx()
{
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
cubehashInit(&deep_2way_ctx.cube,512,16,32);
sph_shavite512_init(&deep_2way_ctx.shavite);
init_echo(&deep_2way_ctx.echo, 512);
};
// Compute the Deep hash for two 80-byte block headers at once.
// input:  2x128-bit interleaved headers; the first 64 bytes per lane have
//         already been absorbed into deep_2way_ctx.luffa (midstate) by
//         scanhash_deep_2way, so only the 16-byte tail is absorbed here.
// output: two concatenated 32-byte digests (lane 0, then lane 1).
// Pipeline: Luffa-512 (2-way SIMD) -> CubeHash-512 -> SHAvite-512 -> ECHO-512.
void deep_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
deep_2way_ctx_holder ctx;
// Start from the prototype contexts; luffa already holds the midstate.
memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
// Absorb the remaining 16 bytes per lane. The tail of the interleaved
// buffer starts at byte offset 64*2 (64 bytes from each lane consumed).
// NOTE(review): length argument assumed to be bytes-per-lane, matching
// the 64 passed in scanhash_deep_2way — confirm against luffa-hash-2way.
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
// Split the 2-way vector result back into per-lane linear buffers;
// the remaining stages are single-lane and run twice.
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
// Reset from the prototype before hashing the second lane.
memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
// Only the first 256 bits of each 512-bit result are reported.
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
// Scan nonces two at a time with the 2-way Deep hash.
// Returns the number of candidate nonces found (0..2); candidates are
// reported via work->nonces and per-lane flags in work->nfound.
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// Position of header word 19 (the nonce) inside the 2x128-bit
// interleaved buffer: word 19 is element 3 of 128-bit chunk 4, so
// lane 0's copy sits at vdata[32+3] and lane 1's at vdata[32+7].
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
// Cheap pre-filter: pick the widest mask consistent with the target so
// most non-solutions are rejected without calling fulltest().
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
// Duplicate the 80-byte header into both lanes (640 bits).
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
// Precompute the luffa midstate over the first 64 bytes per lane.
// NOTE(review): this writes into the shared global deep_2way_ctx; if
// several miner threads run concurrently they all update the same
// global luffa state — verify this is safe for the threading model.
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
// Nonces are stored big-endian in the header image.
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
deep_2way_hash( hash, vdata );
pdata[19] = n;
// Lane 0 result is hash[0..7], lane 1 is hash[8..15].
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/qubit/deep-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "deep-gate.h"
// Wire the Deep algorithm into the algo gate, selecting the 2-way SIMD
// implementation when the build targets AVX2 + AES-NI (see deep-gate.h).
bool register_deep_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
#if defined (DEEP_2WAY)
  init_deep_2way_ctx();
  gate->scanhash = (void*)&scanhash_deep_2way;
  gate->hash     = (void*)&deep_2way_hash;
#else
  init_deep_ctx();
  gate->scanhash = (void*)&scanhash_deep;
  gate->hash     = (void*)&deep_hash;
#endif
  return true;
}

32
algo/qubit/deep-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef DEEP_GATE_H__
#define DEEP_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
// The 2-way SIMD implementation needs both AVX2 and AES-NI.
#if defined(__AVX2__) && defined(__AES__)
#define DEEP_2WAY
#endif
// Install the appropriate scanhash/hash pair into the gate.
bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
// 2-way implementation: hashes two 80-byte headers per call.
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_2way_ctx();
#endif
// Scalar / SSE2 reference implementation.
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_ctx();
#endif

View File

@@ -1,9 +1,9 @@
#include "algo-gate-api.h" #include "deep-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
bool register_deep_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
return true;
};

138
algo/qubit/qubit-2way.c Normal file
View File

@@ -0,0 +1,138 @@
#include "qubit-gate.h"
#if defined(QUBIT_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
// Context bundle for the 2-way Qubit hash: one prototype instance
// (qubit_2way_ctx) is initialised once at startup and copied into a
// stack-local working copy for every hash computed.
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
} qubit_2way_ctx_holder;
// Global prototype contexts. The luffa member also carries the per-work
// midstate written by scanhash_qubit_2way before the nonce loop.
qubit_2way_ctx_holder qubit_2way_ctx;
// One-time initialisation of the prototype contexts (512-bit digests).
void init_qubit_2way_ctx()
{
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
cubehashInit(&qubit_2way_ctx.cube,512,16,32);
sph_shavite512_init(&qubit_2way_ctx.shavite);
simd_2way_init( &qubit_2way_ctx.simd, 512 );
init_echo(&qubit_2way_ctx.echo, 512);
};
// Compute the Qubit hash for two 80-byte block headers at once.
// input:  2x128-bit interleaved headers; the first 64 bytes per lane have
//         already been absorbed into qubit_2way_ctx.luffa (midstate) by
//         scanhash_qubit_2way, so only the 16-byte tail is absorbed here.
// output: two concatenated 32-byte digests (lane 0, then lane 1).
// Pipeline: Luffa-512 (2-way) -> CubeHash-512 -> SHAvite-512 ->
//           SIMD-512 (2-way) -> ECHO-512.
void qubit_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
qubit_2way_ctx_holder ctx;
// Start from the prototype contexts; luffa already holds the midstate.
memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
// Absorb the remaining 16 bytes per lane; the tail of the interleaved
// buffer starts at byte offset 64*2.
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
// Split into per-lane buffers for the single-lane stages.
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
// Reset from the prototype before hashing the second lane.
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
// SIMD-512 runs 2-way: re-interleave, hash, de-interleave.
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
// Only the first 256 bits of each 512-bit result are reported.
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
// Scan nonces two at a time with the 2-way Qubit hash.
// Returns the number of candidate nonces found (0..2); candidates are
// reported via work->nonces and per-lane flags in work->nfound.
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// Position of header word 19 (the nonce) inside the 2x128-bit
// interleaved buffer: word 19 is element 3 of 128-bit chunk 4, so
// lane 0's copy sits at vdata[32+3] and lane 1's at vdata[32+7].
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
// Cheap pre-filter: pick the widest mask consistent with the target so
// most non-solutions are rejected without calling fulltest().
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
// Duplicate the 80-byte header into both lanes (640 bits).
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
// Precompute the luffa midstate over the first 64 bytes per lane.
// NOTE(review): this writes into the shared global qubit_2way_ctx; if
// several miner threads run concurrently they all update the same
// global luffa state — verify this is safe for the threading model.
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
// Nonces are stored big-endian in the header image.
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
qubit_2way_hash( hash, vdata );
pdata[19] = n;
// Lane 0 result is hash[0..7], lane 1 is hash[8..15].
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/qubit/qubit-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "qubit-gate.h"
// Wire the Qubit algorithm into the algo gate, selecting the 2-way SIMD
// implementation when the build targets AVX2 + AES-NI (see qubit-gate.h).
bool register_qubit_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
#if defined (QUBIT_2WAY)
  init_qubit_2way_ctx();
  gate->scanhash = (void*)&scanhash_qubit_2way;
  gate->hash     = (void*)&qubit_2way_hash;
#else
  init_qubit_ctx();
  gate->scanhash = (void*)&scanhash_qubit;
  gate->hash     = (void*)&qubit_hash;
#endif
  return true;
}

32
algo/qubit/qubit-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUBIT_GATE_H__
#define QUBIT_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
// The 2-way SIMD implementation needs both AVX2 and AES-NI.
#if defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY
#endif
// Install the appropriate scanhash/hash pair into the gate.
bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
// 2-way implementation: hashes two 80-byte headers per call.
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_2way_ctx();
#endif
// Scalar / SSE2 reference implementation.
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_ctx();
#endif

View File

@@ -1,11 +1,11 @@
#include "algo-gate-api.h" #include "qubit-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
update_luffa( &qubit_luffa_mid, input, 64 ); update_luffa( &qubit_luffa_mid, input, 64 );
} }
void qubithash(void *output, const void *input) void qubit_hash(void *output, const void *input)
{ {
unsigned char hash[128] __attribute((aligned(64))); unsigned char hash[128] __attribute((aligned(64)));
#define hashB hash+64 #define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
{ {
pdata[19] = ++n; pdata[19] = ++n;
be32enc(&endiandata[19], n); be32enc(&endiandata[19], n);
qubithash(hash64, endiandata); qubit_hash(hash64, endiandata);
#ifndef DEBUG_ALGO #ifndef DEBUG_ALGO
if (!(hash64[7] & mask)) if (!(hash64[7] & mask))
{ {
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
return 0; return 0;
} }
bool register_qubit_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubithash;
return true;
};

View File

@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate ) bool register_scrypt_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt; gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way; // gate->hash = (void*)&scrypt_1024_1_1_256_24way;

View File

@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64 #if defined BE64
#if defined PLW1 #if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] = sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4 #elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else #else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW #endif // PLW
#else // LE64 #else // LE64
#if defined PLW1 #if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ ) for ( u = 0; u < rnum; u ++ )
{ {
#if defined BE64 #if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64 #else // LE64
((__m256i*)dst)[u] = sc->val[u]; ((__m256i*)dst)[u] = sc->val[u];
#endif #endif

View File

@@ -30,13 +30,235 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com> * @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/ */
#if defined(__AVX__)
#include <stddef.h> #include <stddef.h>
#include <string.h> #include <string.h>
#include "sha2-hash-4way.h" #include "sha2-hash-4way.h"
// SHA256 4 way 32 bit
// SHA-256 initial hash value H(0) (FIPS 180-4, section 5.3.3).
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
// SHA-256 round constants. Only the first 64 entries are the genuine
// SHA-256 constants from FIPS 180-4 (section 4.2.2) — SHA-256 has 64
// rounds, not 80. Entries 64..79 below are the high 32 bits of the
// SHA-512 constants K512[64..79] and are NOT part of SHA-256; they are
// kept only so any legacy code indexing up to K256[79] stays in bounds.
// A correct implementation must never read past K256[63].
static const sph_u32 K256[80] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2),
// ---- end of genuine SHA-256 constants (64 entries) ----
SPH_C32(0xCA273ECE), SPH_C32(0xD186B8C7),
SPH_C32(0xEADA7DD6), SPH_C32(0xF57D4F7F),
SPH_C32(0x06F067AA), SPH_C32(0x0A637DC5),
SPH_C32(0x113F9804), SPH_C32(0x1B710B35),
SPH_C32(0x28DB77F5), SPH_C32(0x32CAAB7B),
SPH_C32(0x3C9EBE0A), SPH_C32(0x431D67C4),
SPH_C32(0x4CC5D4BE), SPH_C32(0x597F299C),
SPH_C32(0x5FCB6FAB), SPH_C32(0x6C44198C)
};
// Ch(x,y,z) = (x & y) ^ (~x & z), expressed branch-free as
// ((y ^ z) & x) ^ z  (FIPS 180-4, section 4.1.2).
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
// Maj(x,y,z): bitwise majority vote of the three inputs.
#define MAJs(X, Y, Z) \
_mm_or_si128( _mm_and_si128( X, Y ), \
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
// Sigma0: ROTR2 ^ ROTR13 ^ ROTR22.
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 2), mm_rotr_32(x, 13) ), mm_rotr_32( x, 22) )
// Sigma1: ROTR6 ^ ROTR11 ^ ROTR25.
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 6), mm_rotr_32(x, 11) ), mm_rotr_32( x, 25) )
// sigma0 (message schedule): ROTR7 ^ ROTR18 ^ SHR3.
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 7), mm_rotr_32(x, 18) ), _mm_srli_epi32(x, 3) )
// sigma1 (message schedule): ROTR17 ^ ROTR19 ^ SHR10.
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )
// One SHA-256 round for 4 interleaved lanes. Callers rotate the role of
// the A..H registers instead of shuffling data; W[] and K256[] are read
// from the enclosing scope. Arguments are plain variables, so the
// multiple evaluation inherent in a macro is harmless here.
#define SHA256_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[i] ) ), W[i] ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
// One SHA-256 compression of a 64-byte block for 4 interleaved lanes.
// in: 16 big-endian 32-bit words per lane; r: running state, updated
// in place.
// FIX: SHA-256 uses 64 rounds and a 64-entry message schedule
// (FIPS 180-4, section 6.2.2). The previous code expanded W to 80 and
// ran 80 rounds — a copy/paste from the SHA-512 path — consuming the
// bogus K256[64..79] entries and producing non-standard digests.
static void
sha256_4way_round( __m128i *in, __m128i r[8] )
{
   int i;
   __m128i A, B, C, D, E, F, G, H;
   __m128i W[64];

   // Load the block (data is little-endian in memory; SHA-256 is
   // big-endian) and expand the 64-word message schedule.
   for ( i = 0; i < 16; i++ )
      W[i] = mm_bswap_32( in[i] );
   for ( i = 16; i < 64; i++ )
      W[i] = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32(
           SSG2_1( W[ i-2 ] ), W[ i-7 ] ), SSG2_0( W[ i-15 ] ) ), W[ i-16 ] );

   A = r[0];
   B = r[1];
   C = r[2];
   D = r[3];
   E = r[4];
   F = r[5];
   G = r[6];
   H = r[7];

   // 64 rounds, 8 at a time with the working registers rotated through
   // the macro arguments instead of shuffling data.
   for ( i = 0; i < 64; i += 8 )
   {
      SHA256_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
      SHA256_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
      SHA256_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
      SHA256_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
      SHA256_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
      SHA256_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
      SHA256_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
      SHA256_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
   }

   // Davies-Meyer feed-forward.
   r[0] = _mm_add_epi32( r[0], A );
   r[1] = _mm_add_epi32( r[1], B );
   r[2] = _mm_add_epi32( r[2], C );
   r[3] = _mm_add_epi32( r[3], D );
   r[4] = _mm_add_epi32( r[4], E );
   r[5] = _mm_add_epi32( r[5], F );
   r[6] = _mm_add_epi32( r[6], G );
   r[7] = _mm_add_epi32( r[7], H );
}
// Reset the 4-way context: zero the byte counters and broadcast the
// SHA-256 initial state H(0) into all four lanes.
void sha256_4way_init( sha256_4way_context *sc )
{
   int i;
   sc->count_high = sc->count_low = 0;
   for ( i = 0; i < 8; i++ )
      sc->val[i] = _mm_set1_epi32( H256[i] );
}
// Absorb len bytes (per lane) of 4-way interleaved message data,
// compressing each time the 64-byte internal buffer fills. The 64-bit
// byte count is kept in two 32-bit halves with manual carry.
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
{
   __m128i *src = (__m128i*)data;
   const int buf_size = 64;
   size_t pos = (unsigned)sc->count_low & (buf_size - 1U);

   while ( len > 0 )
   {
      // Copy as much as fits in the buffer this pass.
      size_t chunk = buf_size - pos;
      if ( chunk > len )
         chunk = len;
      memcpy_128( sc->buf + (pos>>2), src, chunk>>2 );
      src += chunk>>2;
      pos += chunk;
      len -= chunk;
      if ( pos == buf_size )
      {
         sha256_4way_round( sc->buf, sc->val );
         pos = 0;
      }
      // Advance the length counter, carrying into the high word.
      uint32_t prev_low = sc->count_low;
      sc->count_low = SPH_T32( prev_low + chunk );
      if ( sc->count_low < prev_low )
         sc->count_high++;
   }
}
// Finalize: append the 0x80 pad marker, zero-fill, append the message
// length in bits (big-endian, high word then low word) and write the
// 32-byte big-endian digest for each of the 4 lanes to dst.
void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
// 0x80 lands in the low byte of the next word; the buffer is kept
// little-endian and byte-swapped inside sha256_4way_round, so this is
// the standard leading pad bit for a byte-aligned message.
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
// No room left for the 8-byte length field: flush this block and pad
// a fresh one.
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_round( sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
// Convert the byte count into a 64-bit bit count, split high/low.
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] =
mm_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
// SHA-256 output is big-endian.
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
}
#if defined(__AVX2__) #if defined(__AVX2__)
// SHA512 4 way 64 bit
// SHA-512 initial hash value H(0) (FIPS 180-4, section 5.3.5).
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
static const sph_u64 K512[80] = { static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
@@ -80,13 +302,6 @@ static const sph_u64 K512[80] = {
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
}; };
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#define CH(X, Y, Z) \ #define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -129,7 +344,7 @@ sha512_4way_round( __m256i *in, __m256i r[8] )
__m256i W[80]; __m256i W[80];
for ( i = 0; i < 16; i++ ) for ( i = 0; i < 16; i++ )
W[i] = mm256_byteswap_64( in[i] ); W[i] = mm256_bswap_64( in[i] );
for ( i = 16; i < 80; i++ ) for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] ); SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -182,7 +397,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{ {
__m256i *vdata = (__m256i*)data; __m256i *vdata = (__m256i*)data;
size_t ptr; size_t ptr;
int buf_size = 128; const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U); ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 ) while ( len > 0 )
@@ -207,8 +422,8 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_close( sha512_4way_context *sc, void *dst ) void sha512_4way_close( sha512_4way_context *sc, void *dst )
{ {
unsigned ptr, u; unsigned ptr, u;
int buf_size = 128; const int buf_size = 128;
int pad = buf_size - 16; const int pad = buf_size - 16;
ptr = (unsigned)sc->count & (buf_size - 1U); ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 ); sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
@@ -224,13 +439,14 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = sc->buf[ pad >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = sc->buf[ ( pad+8 ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val ); sha512_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ ) for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
} }
#endif #endif // __AVX2__
#endif // __AVX__

View File

@@ -44,47 +44,19 @@
#include "sph_types.h" #include "sph_types.h"
#include "avxdefs.h" #include "avxdefs.h"
#if 0 #if defined(__AVX__)
#define SPH_SIZE_sha224 224
#define SPH_SIZE_sha256 256 #define SPH_SIZE_sha256 256
typedef struct { typedef struct {
#ifndef DOXYGEN_IGNORE __m128i buf[64>>2];
unsigned char buf[64]; /* first field, for alignment */ __m128i val[8];
sph_u32 val[8]; uint32_t count_high, count_low;
#if SPH_64 } sha256_4way_context;
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha224_context;
typedef sph_sha224_context sph_sha256_context; void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sph_sha224_init(void *cc); void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sph_sha224(void *cc, const void *data, size_t len);
void sph_sha224_close(void *cc, void *dst);
void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
void sph_sha256_init(void *cc);
void sph_sha256(void *cc, const void *data, size_t len);
void sph_sha256_close(void *cc, void *dst);
void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
#endif
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -102,3 +74,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif #endif
#endif #endif
#endif

View File

@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
}; };
// Treat hi:lo as one 256-bit vector and return only the rotated high
// 128 bits: hi is shifted right by n 32-bit lanes and the vacated high
// lanes are filled from the low lanes of lo. This is a partial rotation
// (the rotated lo half is never produced), which is cheaper than a full
// mm_rotr256_1x32 when the caller only needs the high half.
// NOTE(review): _mm_srli_si128 / _mm_slli_si128 require immediate byte
// counts, so n must be a compile-time constant at every call site
// (works here because the function is static inline).
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ #define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \ sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \ sph_u32 t1 = (x1); \
@@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round // round
k00 = m[0]; k00 = m[0];
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1]; k01 = m[1];
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2]; k02 = m[2];
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3]; k03 = m[3];
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
k10 = m[4]; k10 = m[4];
x = _mm_xor_si128( p3, k10 ); x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5]; k11 = m[5];
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6]; k12 = m[6];
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7]; k13 = m[7];
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ ) for ( r = 0; r < 3; r ++ )
{ {
// round 1, 5, 9 // round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 ) if ( r == 0 )
@@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) ); ~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 ) if ( r == 1 )
@@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) ); ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 ) if ( r == 2 )
@@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) ); ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10 // round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 ); x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 ); x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11 // round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 ); x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 ); x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12 // round 4, 8, 12
@@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 ); x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
} }
// round 13 // round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 ); h[0] = _mm_xor_si128( h[0], p2 );

853
algo/simd/simd-hash-2way.c Normal file
View File

@@ -0,0 +1,853 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "simd-hash-2way.h"
#if defined (__AVX2__)
// imported from simd_iv.h
// SIMD-512 initial chaining value (from the reference simd_iv.h):
// 8 state vectors x 4 x 32-bit words; each is duplicated into both
// 128-bit lanes at init time for the 2-way interleaved implementation.
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
                           0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
                           0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
                           0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
                           0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
                           0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
                           0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
                           0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };

/* Twiddle tables */
// NTT twiddle factors as signed 16-bit residues; each table row holds the
// same 8 factors repeated in both 128-bit lanes of the 256-bit vector.

static const m256_v16 FFT64_Twiddle[] =
{
    {{ 1,    2,    4,    8,   16,   32,   64,  128,
       1,    2,    4,    8,   16,   32,   64,  128 }},
    {{ 1,   60,    2,  120,    4,  -17,    8,  -34,
       1,   60,    2,  120,    4,  -17,    8,  -34 }},
    {{ 1,  120,    8,  -68,   64,  -30,   -2,   17,
       1,  120,    8,  -68,   64,  -30,   -2,   17 }},
    {{ 1,   46,   60,  -67,    2,   92,  120,  123,
       1,   46,   60,  -67,    2,   92,  120,  123 }},
    {{ 1,   92,  -17,  -22,   32,  117,  -30,   67,
       1,   92,  -17,  -22,   32,  117,  -30,   67 }},
    {{ 1,  -67,  120,  -73,    8,  -22,  -68,  -70,
       1,  -67,  120,  -73,    8,  -22,  -68,  -70 }},
    {{ 1,  123,  -34,  -70,  128,   67,   17,   35,
       1,  123,  -34,  -70,  128,   67,   17,   35 }},
};

static const m256_v16 FFT128_Twiddle[] =
{
    {{   1, -118,   46,  -31,   60,  116,  -67,  -61,
         1, -118,   46,  -31,   60,  116,  -67,  -61 }},
    {{   2,   21,   92,  -62,  120,  -25,  123, -122,
         2,   21,   92,  -62,  120,  -25,  123, -122 }},
    {{   4,   42,  -73, -124,  -17,  -50,  -11,   13,
         4,   42,  -73, -124,  -17,  -50,  -11,   13 }},
    {{   8,   84,  111,    9,  -34, -100,  -22,   26,
         8,   84,  111,    9,  -34, -100,  -22,   26 }},
    {{  16,  -89,  -35,   18,  -68,   57,  -44,   52,
        16,  -89,  -35,   18,  -68,   57,  -44,   52 }},
    {{  32,   79,  -70,   36,  121,  114,  -88,  104,
        32,   79,  -70,   36,  121,  114,  -88,  104 }},
    {{  64,  -99,  117,   72,  -15,  -29,   81,  -49,
        64,  -99,  117,   72,  -15,  -29,   81,  -49 }},
    {{ 128,   59,  -23, -113,  -30,  -58,  -95,  -98,
       128,   59,  -23, -113,  -30,  -58,  -95,  -98 }},
};

static const m256_v16 FFT256_Twiddle[] =
{
    {{   1,   41, -118,   45,   46,   87,  -31,   14,
         1,   41, -118,   45,   46,   87,  -31,   14 }},
    {{  60, -110,  116, -127,  -67,   80,  -61,   69,
        60, -110,  116, -127,  -67,   80,  -61,   69 }},
    {{   2,   82,   21,   90,   92,  -83,  -62,   28,
         2,   82,   21,   90,   92,  -83,  -62,   28 }},
    {{ 120,   37,  -25,    3,  123,  -97, -122, -119,
       120,   37,  -25,    3,  123,  -97, -122, -119 }},
    {{   4,  -93,   42,  -77,  -73,   91, -124,   56,
         4,  -93,   42,  -77,  -73,   91, -124,   56 }},
    {{ -17,   74,  -50,    6,  -11,   63,   13,   19,
       -17,   74,  -50,    6,  -11,   63,   13,   19 }},
    {{   8,   71,   84,  103,  111,  -75,    9,  112,
         8,   71,   84,  103,  111,  -75,    9,  112 }},
    {{ -34, -109, -100,   12,  -22,  126,   26,   38,
       -34, -109, -100,   12,  -22,  126,   26,   38 }},
    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33,
        16, -115,  -89,  -51,  -35,  107,   18,  -33 }},
    {{ -68,   39,   57,   24,  -44,   -5,   52,   76,
       -68,   39,   57,   24,  -44,   -5,   52,   76 }},
    {{  32,   27,   79, -102,  -70,  -43,   36,  -66,
        32,   27,   79, -102,  -70,  -43,   36,  -66 }},
    {{ 121,   78,  114,   48,  -88,  -10,  104, -105,
       121,   78,  114,   48,  -88,  -10,  104, -105 }},
    {{  64,   54,  -99,   53,  117,  -86,   72,  125,
        64,   54,  -99,   53,  117,  -86,   72,  125 }},
    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47,
       -15, -101,  -29,   96,   81,  -20,  -49,   47 }},
    {{ 128,  108,   59,  106,  -23,   85, -113,   -7,
       128,  108,   59,  106,  -23,   85, -113,   -7 }},
    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94,
       -30,   55,  -58,  -65,  -95,  -40,  -98,   94 }}
};

// Shuffle-immediate encodings: swap adjacent 32-bit elements (1), swap
// 64-bit halves of each lane (2), or full 32-bit reversal within lane (3).
#define SHUFXOR_1 0xb1          /* 0b10110001 */
#define SHUFXOR_2 0x4e          /* 0b01001110 */
#define SHUFXOR_3 0x1b          /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))

// imported from vector.c
// Partial reduction mod 257: x = (x & 255) - (x >> 8); result in (-128, 383).
#define REDUCE(x) \
  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
                    _mm256_srai_epi16( x, 8 ) )

// Fold values > 128 back by 257, giving a representative in [-128, 128].
#define EXTRA_REDUCE_S(x)\
  _mm256_sub_epi16( x, \
        _mm256_and_si256( _mm256_set1_epi16( 257 ), \
           _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )

#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )

// These operate on the register file of the FFT kernels via the X(i) macro.
#define DO_REDUCE( i ) X(i) = REDUCE( X(i) )

#define DO_REDUCE_FULL_S(i) \
do { \
  X(i) = REDUCE( X(i) ); \
  X(i) = EXTRA_REDUCE_S( X(i) ); \
} while(0)
// 2-way interleaved 64-point NTT over Z/257, in place.
// a: 8 __m256i = 64 signed 16-bit coefficients per 128-bit lane (two
//    independent inputs interleaved across the lanes).
// Structure: radix-8 DIF pass (output in revbin order), twiddle multiply,
// revbin transpose, radix-8 DIT pass — yielding an in-order FFT_64.
//
// Fixes vs. the imported code: the trailing `#undef BUTTERFLY` undefined a
// name that never existed, leaking BUTTERFLY_0/BUTTERFLY_N past this
// function; and the file-scope DO_REDUCE macro was redundantly redefined.
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

  // Begin with 8 parallels DIF FFT_8
  //
  // FFT_8 using w=4 as 8th root of unity
  //  Unrolled decimation in frequency (DIF) radix-2 NTT.
  //  Output data is in revbin_permuted order.
  // w[n] is the shift amount for multiplication by the twiddle 4^n (a power
  // of two mod 257, hence a left shift).
  static const int w[] = {0, 2, 4, 6};

#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the column.
  // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

  //Finish with 8 parallels DIT FFT_8
  //FFT_8 using w=4 as 8th root of unity
  // Unrolled decimation in time (DIT) radix-2 NTT.
  // Input data is in revbin_permuted order.
#define BUTTERFLY_0( i,j ) \
do { \
   __m256i u = X(j); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
   __m256i u = X(j); \
   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

// Was `#undef BUTTERFLY` (never defined); undef the names actually defined
// so they don't leak into the rest of the translation unit.
#undef BUTTERFLY_0
#undef BUTTERFLY_N

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}
// 2-way interleaved 128-point NTT, in place, built from two 64-point NTTs.
// a: 16 __m256i = 128 signed 16-bit coefficients per 128-bit lane.
void fft128_2way( void *a )
{
  // Scratch for the low half-transform so the final interleave can read
  // both halves without clobbering.
  __m256i scratch[8];
  __m256i *A = (__m256i*) a;
  int k;

  /* Size-2 butterflies: low half into scratch, twiddled high half in place */
  for ( k = 0; k < 8; k++ )
  {
     __m256i lo = _mm256_add_epi16( A[ k ], A[ k+8 ] );
     __m256i hi = _mm256_sub_epi16( A[ k ], A[ k+8 ] );
     scratch[ k ] = REDUCE_FULL_S( lo );
     hi = REDUCE_FULL_S( hi );
     hi = _mm256_mullo_epi16( hi, FFT128_Twiddle[ k ].m256i );
     A[ k+8 ] = REDUCE_FULL_S( hi );
  }

  fft64_2way( scratch );
  fft64_2way( A+8 );

  /* Transpose (i.e. interleave) the two half-size transforms back into A */
  for ( k = 0; k < 8; k++ )
  {
     A[ 2*k   ] = _mm256_unpacklo_epi16( scratch[ k ], A[ k+8 ] );
     A[ 2*k+1 ] = _mm256_unpackhi_epi16( scratch[ k ], A[ k+8 ] );
  }
}
// Expand a message block into a 128-point NTT input and transform it.
// a:     output, 256 x 16-bit per lane (two fft64 halves at a and a+128).
// x:     input message bytes (2-way interleaved layout).
// final: nonzero selects FinalTweak so the last block is domain-separated
//        (introduces the X^127 term differently — see UNPACK_TWEAK).
void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  // Tweak constants added/subtracted on the last unpacked vector; the pair
  // is repeated for both 128-bit lanes.
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

// Zero-extend bytes to 16-bit words; low halves go to A[0..7], twiddled
// copies (the size-2 butterfly with the implicit zero upper half) to
// A[8..15].
#define UNPACK( i ) \
do { \
    __m256i t = X[i]; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
    A[2*i+8] = REDUCE(A[2*i+8]); \
    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
    A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)

   // This allows to tweak the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
    __m256i t = X[i]; \
    __m256i tmp; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
    A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  if ( final )
     UNPACK_TWEAK( 3, FinalTweak.m256i );
  else
     UNPACK_TWEAK( 3, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  // a+128 uint16_t elements == A[8..15]: transform each half in place.
  fft64_2way( a );
  fft64_2way( a+128 );
}
// Expand a 128-byte message block into a 256-point NTT input and transform.
// a:     output, 512 x 16-bit per lane (two fft128 halves at a and a+256).
// x:     input message bytes (2-way interleaved layout).
// final: nonzero selects FinalTweak for last-block domain separation.
void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT256_Twiddle;

// Zero-extend bytes to 16-bit words; low halves to A[0..15], twiddled
// copies to A[16..31] (size-2 butterfly with implicit zero upper half).
#define UNPACK( i ) \
do { \
  __m256i t = X[i]; \
  A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
  A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                      FFT256_Twiddle[ 2*i ].m256i ); \
  A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
  A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
  A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
                                      FFT256_Twiddle[ 2*i + 1 ].m256i ); \
  A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)

// This allows to tweak the last butterflies to introduce X^127
// NOTE(review): unlike UNPACK above and the fft128 variant, no REDUCE is
// applied to A[2*i+17] here — presumably safe because fft128_2way reduces
// early, but worth confirming against the reference vector.c.
#define UNPACK_TWEAK( i,tw ) \
do { \
  __m256i t = X[i]; \
  __m256i tmp; \
  A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
  A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                      FFT256_Twiddle[ 2*i ].m256i ); \
  A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
  tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
  A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
  A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                      FFT256_Twiddle[ 2*i + 1 ].m256i ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  UNPACK( 3 );
  UNPACK( 4 );
  UNPACK( 5 );
  UNPACK( 6 );
  if ( final )
    UNPACK_TWEAK( 7, FinalTweak.m256i );
  else
    UNPACK_TWEAK( 7, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  // a+256 uint16_t elements == A[16..31]: transform each half in place.
  fft128_2way( a );
  fft128_2way( a+256 );
}
// SIMD-512 round function, 2-way interleaved.
// state: 8 __m256i of chaining state (S0..S3, each split into l/h halves).
// msg:   the raw message block, XORed into the state at entry and used
//        again (via S[]) in the feed-forward.
// fft:   the NTT-expanded message produced by fft256_2way_msg.
// Runs 8 message rounds (4 with code 185, 4 with code 233) plus one
// feed-forward round, then writes the state back.
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
  register __m256i S0l, S1l, S2l, S3l;
  register __m256i S0h, S1h, S2h, S3h;
  __m256i *S = (__m256i*) state;
  __m256i *M = (__m256i*) msg;
  __m256i *W = (__m256i*) fft;

  // Per-round multipliers used when recombining fft words (see MSG).
  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };

  // XOR the message into the state.
  S0l = _mm256_xor_si256( S[0], M[0] );
  S0h = _mm256_xor_si256( S[1], M[1] );
  S1l = _mm256_xor_si256( S[2], M[2] );
  S1h = _mm256_xor_si256( S[3], M[3] );
  S2l = _mm256_xor_si256( S[4], M[4] );
  S2h = _mm256_xor_si256( S[5], M[5] );
  S3l = _mm256_xor_si256( S[6], M[6] );
  S3h = _mm256_xor_si256( S[7], M[7] );

#define S(i) S##i

// Boolean round functions: F_0 = IF(B,C,D), F_1 = MAJ(B,C,D).
#define F_0(B, C, D) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
#define F_1(B, C, D) \
   _mm256_or_si256( _mm256_and_si256( D, C ),\
                    _mm256_and_si256( _mm256_or_si256( D,C ), B ) )

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

  // We split the round function in two halfes
  // so as to insert some independent computations in between

  // SUM7_rz tables: z'th entry of the length-7 permutation sequence rotated
  // by r; PERM below dispatches on the result via token pasting.
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

// Select the element permutation for step z of the round starting at
// PERM_START (a macro redefined before each ROUND below).
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

// PERM_n implements "XOR k" on the 8 32-bit element indices of the
// 256-bit (l,h) register pair, via in-lane shuffles and l/h swaps.
#define PERM_0(d,a) /* XOR 1 */ \
do { \
    d##l = shufxor( a##l, 1 ); \
    d##h = shufxor( a##h, 1 ); \
} while(0)

#define PERM_1(d,a) /* XOR 6 */ \
do { \
    d##l = shufxor( a##h, 2 ); \
    d##h = shufxor( a##l, 2 ); \
} while(0)

#define PERM_2(d,a) /* XOR 2 */ \
do { \
    d##l = shufxor( a##l, 2 ); \
    d##h = shufxor( a##h, 2 ); \
} while(0)

#define PERM_3(d,a) /* XOR 3 */ \
do { \
    d##l = shufxor( a##l, 3 ); \
    d##h = shufxor( a##h, 3 ); \
} while(0)

#define PERM_4(d,a) /* XOR 5 */ \
do { \
   d##l = shufxor( a##h, 1 ); \
   d##h = shufxor( a##l, 1 ); \
} while(0)

#define PERM_5(d,a) /* XOR 7 */ \
do { \
   d##l = shufxor( a##h, 3 ); \
   d##h = shufxor( a##l, 3 ); \
} while(0)

#define PERM_6(d,a) /* XOR 4 */ \
do { \
   d##l = a##h; \
   d##h = a##l; \
} while(0)

// First half of a step: compute F, rotate a, add message word, rotate the
// temporary, and permute d <- a.  STEP_2 then folds the temporary into d.
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
   TTl  = Fl( a,b,c,fun ); \
   TTh  = Fh( a,b,c,fun ); \
   a##l = mm256_rotl_32( a##l, r ); \
   a##h = mm256_rotl_32( a##h, r ); \
   w##l = _mm256_add_epi32( w##l, d##l ); \
   w##h = _mm256_add_epi32( w##h, d##h ); \
   TTl  = _mm256_add_epi32( TTl, w##l ); \
   TTh  = _mm256_add_epi32( TTh, w##h ); \
   TTl  = mm256_rotl_32( TTl, s ); \
   TTh  = mm256_rotl_32( TTh, s ); \
   PERM( z,d,a ); \
} while(0)

#define STEP_1( a,b,c,d,w,fun,r,s,z )   STEP_1_( a,b,c,d,w,fun,r,s,z )

#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
   d##l = _mm256_add_epi32( d##l, TTl ); \
   d##h = _mm256_add_epi32( d##h, TTh ); \
} while(0)

#define STEP_2( a,b,c,d,w,fun,r,s )   STEP_2_( a,b,c,d,w,fun,r,s )

#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
do { \
   register __m256i TTl, TTh, Wl=w1, Wh=w2; \
   STEP_1( a,b,c,d,W,fun,r,s,z ); \
   STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);

// Build a 32-bit message-word pair from two 16-bit fft words, scaled by
// code[z] (185 or 233).
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)

#define MSG( w,hh,ll,u,z ) \
do { \
   int a = MSG_##u(hh); \
   int b = MSG_##u(ll); \
   w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
   w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
   w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
   w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
} while(0)

// One full round: four steps with rotation amounts (r,s,t,u), each MSG
// load interleaved with the previous step's STEP_2 for ILP.
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
do { \
   register __m256i W0l, W1l, W2l, W3l, TTl; \
   register __m256i W0h, W1h, W2h, W3h, TTh; \
   MSG( W0, h0, l0, u0, z ); \
   STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
   MSG( W1, h1, l1, u1, z ); \
   STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
   STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
   MSG( W2,h2,l2,u2,z ); \
   STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
   STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
   MSG( W3,h3,l3,u3,z ); \
   STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
   STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
   STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)

  // 4 rounds with code 185
#define PERM_START 0
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0);
#undef PERM_START
#define PERM_START 5
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0);
#undef PERM_START

  // 4 rounds with code 233
#define PERM_START 2
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 6
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 3
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
#undef PERM_START

  // 1 round as feed-forward: mix in the pre-round state S[] (the
  // message-XORed input), then store the result.
#define PERM_START 4
  STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0 );
  STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
  STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
  STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3 );

  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;

#undef PERM_START
#undef STEP_1
#undef STEP_2
#undef STEP
#undef ROUND
}
// One 2-way SIMD compression step: expand the message block with the
// 256-point NTT, then run the 512-bit round function over the state.
// final selects last-block tweaking in the expansion.
void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
{
  m256_v16 expanded[32];                       // NTT output buffer
  uint16_t *fft = (uint16_t*) expanded[0].u16;

  fft256_2way_msg( fft, m, final );
  rounds512_2way( state->A, m, fft );
}
// imported from nist.c
// Initialise a 2-way SIMD state. Both 128-bit lanes of every AVX2
// register receive the same SIMD-512 IV words, so the two lanes start
// from identical chaining values. Note the IV is always the 512-bit one
// regardless of 'hashbitlen'; only the stored length differs.
// Always returns 0.
int simd_2way_init( simd_2way_context *state, int hashbitlen )
{
  __m256i *iv = (__m256i*)state->A;
  int i;

  state->hashbitlen = hashbitlen;
  state->n_feistels = 8;
  state->blocksize  = 128*8;   // bits per lane in one block
  state->count      = 0;

  for ( i = 0; i < 8; i++ )
  {
     const uint32_t w0 = SIMD_IV_512[ 4*i     ];
     const uint32_t w1 = SIMD_IV_512[ 4*i + 1 ];
     const uint32_t w2 = SIMD_IV_512[ 4*i + 2 ];
     const uint32_t w3 = SIMD_IV_512[ 4*i + 3 ];
     // duplicate the four IV words into both 128-bit lanes
     iv[i] = _mm256_set_epi32( w3, w2, w1, w0, w3, w2, w1, w0 );
  }
  return 0;
}
// Absorb 'databitlen' bits of message data into the 2-way state.
// 'databitlen' counts the bits of ONE lane; the interleaved input holds
// both lanes, so every byte offset is doubled (the 2* factors).
// Data is hashed straight from the caller's buffer when a whole block is
// available and nothing is buffered; otherwise it is staged in
// state->buffer until a block fills. Always returns 0.
int simd_2way_update( simd_2way_context *state, const void *data,
                      int databitlen )
{
  int bs = state->blocksize;              // bits per lane in one block
  int current = state->count & (bs - 1);  // bits already buffered
  // Arithmetic on void* is a GCC extension (invalid in ISO C);
  // step through the input with a byte pointer instead.
  const uint8_t *in = (const uint8_t*)data;

  while ( databitlen > 0 )
  {
    if ( current == 0 && databitlen >= bs )
    {
      // Full block and empty buffer: hash directly from the input.
      SIMD_2way_Compress( state, in, 0 );
      databitlen -= bs;
      in += 2*(bs/8);
      state->count += bs;
    }
    else
    {
      // Copy a chunk of data to the buffer.
      int len = bs - current;
      if ( databitlen < len )
      {
        // Not enough to finish a block: stage it and return.
        memcpy( state->buffer + 2*(current/8), in, 2*((databitlen+7)/8) );
        state->count += databitlen;
        return 0;
      }
      else
      {
        // Buffer is now full: hash it and continue with the remainder.
        memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
        state->count += len;
        databitlen -= len;
        in += 2*(len/8);
        current = 0;
        SIMD_2way_Compress( state, state->buffer, 0 );
      }
    }
  }
  return 0;
}
// Finalise the hash: flush any buffered data, feed the bit length as a
// dedicated last block, and copy the digest out of the state.
// hashval receives 2*(hashbitlen/8) bytes (both interleaved lanes).
// Always returns 0.
int simd_2way_close( simd_2way_context *state, void *hashval )
{
  uint64_t l;
  int current = state->count & (state->blocksize - 1);
  int i;
  int isshort = 1;
  // If there is still some data in the buffer, hash it
  if ( current )
  {
    current = ( current+7 ) / 8;   // round partial block up to whole bytes
    // zero the unused tail of both lanes before compressing
    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) );
    SIMD_2way_Compress( state, state->buffer, 0 );
  }
  //* Input the message length as the last block
  memset( state->buffer, 0, 2*(state->blocksize / 8) );
  l = state->count;
  for ( i = 0; i < 8; i++ )
  {
    // little-endian 64-bit length, written once per 16-byte lane
    state->buffer[ i ] = l & 0xff;
    state->buffer[ i+16 ] = l & 0xff;
    l >>= 8;
  }
  // final-block code passed to the compressor: 2 for messages shorter
  // than 16384 bits, 1 otherwise (matches the reference SIMD padding)
  if ( state->count < 16384 )
    isshort = 2;
  SIMD_2way_Compress( state, state->buffer, isshort );
  memcpy( hashval, state->A, 2*(state->hashbitlen / 8) );
  return 0;
}
// Absorb 'databitlen' bits of message data (one lane's bit count; the
// interleaved input holds both lanes) and finalise the hash in one call.
// hashval receives 2*(hashbitlen/8) bytes. Always returns 0.
//
// The original body duplicated the absorb loop and the finalisation
// byte-for-byte (including non-portable void* arithmetic); the only
// difference was 'break' instead of 'return' on the partial-copy path,
// which makes delegating to the two primitives behaviorally identical:
// simd_2way_update() returns after staging a partial block, and
// simd_2way_close() performs exactly the finalisation that followed the
// loop here (it re-derives 'current' from state->count itself).
int simd_2way_update_close( simd_2way_context *state, void *hashval,
                            const void *data, int databitlen )
{
  simd_2way_update( state, data, databitlen );
  return simd_2way_close( state, hashval );
}
#endif

View File

@@ -0,0 +1,27 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context;
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif

View File

@@ -1,3 +1,6 @@
#if !defined(SIMD_IV_H__)
#define SIMD_IV_H__
u32 IV_224[] = { u32 IV_224[] = {
0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53, 0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96, 0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
}; };
#endif

View File

@@ -1,23 +0,0 @@
#ifndef DEFS_X5_H__
#define DEFS_X5_H__
#include <emmintrin.h>
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;
typedef struct {
uint32 buffer[8]; /* Buffer to be hashed */
__m128i chainv[10]; /* Chaining values */
uint64 bitlen[2]; /* Message length in bits */
uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;
} hashState_luffa;
typedef unsigned char byte;
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
v16* const A = a; v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7; register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8 #if V16_SIZE == 8
#define X(i) A[i] #define X(i) A[i]
#elif V16_SIZE == 4 #elif V16_SIZE == 4
#define X(i) A[2*i] #define X(i) A[2*i]
#endif #endif
*/
#define X(i) X##i #define X(i) X##i
X0 = A[0]; X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20); STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3); S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
} }
@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
*/ */
#define PERM_START 0 #define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4 #define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1 #define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5 #define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/* /*
* 4 rounds with code 233 * 4 rounds with code 233
*/ */
#define PERM_START 2 #define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6 #define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3 #define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0 #define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/* /*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1); STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2); STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3); STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
} }
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) { void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {

View File

@@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
} }
count[0] = mm_byteswap_32( count[0] = mm_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) ); _mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) ); ( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block ); sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ ) for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] ); hash[i] = mm_bswap_32( ctx->digest[i] );
} }
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \ #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
@@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j; int j;
for ( j = 0; j < 16; j++ ) for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] ); W[j] = mm_bswap_32( block[j] );
for ( j = 16; j < 68; j++ ) for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],

View File

@@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64 #if defined BE64
#if defined PLW1 #if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] = sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4 #elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else #else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW #endif // PLW
#else // LE64 #else // LE64
#if defined PLW1 #if defined PLW1
@@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ ) for ( u = 0; u < rnum; u ++ )
{ {
#if defined BE64 #if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64 #else // LE64
((__m256i*)dst)[u] = sc->val[u]; ((__m256i*)dst)[u] = sc->val[u];
#endif #endif

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -25,10 +25,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} c11_4way_ctx_holder; } c11_4way_ctx_holder;
@@ -42,10 +42,10 @@ void init_c11_4way_ctx()
skein512_4way_init( &c11_4way_ctx.skein ); skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh ); jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak ); keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 ); luffa_2way_init( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite ); sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 ); simd_2way_init( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 ); init_echo( &c11_4way_ctx.echo, 512 );
} }
@@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx; c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
@@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, simd_2way_init( &ctx.simd, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,7 +12,7 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
@@ -25,7 +25,7 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
} tt8_4way_ctx_holder; } tt8_4way_ctx_holder;
@@ -39,7 +39,7 @@ void init_tt8_4way_ctx()
skein512_4way_init( &tt8_4way_ctx.skein ); skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh ); jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak ); keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 ); luffa_2way_init( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
}; };
@@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input)
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence*)hash1, dataLen ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 7 ) if ( i != 7 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -9,7 +9,7 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
} tt10_4way_ctx_holder; } tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
@@ -43,10 +43,10 @@ void init_tt10_4way_ctx()
skein512_4way_init( &tt10_4way_ctx.skein ); skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh ); jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak ); keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 ); luffa_2way_init( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite ); sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 ); simd_2way_init( &tt10_4way_ctx.simd, 512 );
}; };
void timetravel10_4way_hash(void *output, const void *input) void timetravel10_4way_hash(void *output, const void *input)
@@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence*)hash1, dataLen ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 9 ) if ( i != 9 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 9: case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
if ( i != 9 ) if ( i != 9 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -8,10 +8,10 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"

View File

@@ -5,17 +5,16 @@
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "algo/blake/blake-hash-4way.h" #include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -25,10 +24,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11_4way_ctx_holder; } x11_4way_ctx_holder;
@@ -42,10 +41,10 @@ void init_x11_4way_ctx()
skein512_4way_init( &x11_4way_ctx.skein ); skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh ); jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak ); keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 ); luffa_2way_init( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite ); sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 ); simd_2way_init( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 ); init_echo( &x11_4way_ctx.echo, 512 );
} }
@@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx; x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
@@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa parallel 2 way 128 bit
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, simd_2way_init( &ctx.simd, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -10,10 +10,8 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h" #include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
@@ -21,9 +19,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -11,15 +11,12 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
typedef struct { typedef struct {
blake512_4way_context blake; blake512_4way_context blake;
@@ -28,10 +25,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11evo_4way_ctx_holder; } x11evo_4way_ctx_holder;
@@ -45,10 +42,11 @@ void init_x11evo_4way_ctx()
skein512_4way_init( &x11evo_4way_ctx.skein ); skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh ); jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak ); keccak512_4way_init( &x11evo_4way_ctx.keccak );
luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
init_luffa( &x11evo_4way_ctx.luffa, 512 ); init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite ); sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 ); simd_2way_init( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 ); init_echo( &x11evo_4way_ctx.echo, 512 );
} }
@@ -142,20 +140,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 ); vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 ) if ( i < len-1 )
mm256_interleave_4x64( vhash, mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 ); hash0, hash1, hash2, hash3, 64<<3 );
@@ -202,17 +193,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 9: case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 ); vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 ) if ( i < len-1 )
mm256_interleave_4x64( vhash, mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 ); hash0, hash1, hash2, hash3, 64<<3 );

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
typedef struct { typedef struct {
#ifdef NO_AES_NI #ifdef NO_AES_NI

View File

@@ -13,10 +13,10 @@
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -27,10 +27,10 @@ typedef struct {
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
sph_gost512_context gost; sph_gost512_context gost;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11gost_4way_ctx_holder; } x11gost_4way_ctx_holder;
@@ -45,10 +45,10 @@ void init_x11gost_4way_ctx()
jh512_4way_init( &x11gost_4way_ctx.jh ); jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak ); keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost ); sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 ); luffa_2way_init( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite ); sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 ); simd_2way_init( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 ); init_echo( &x11gost_4way_ctx.echo, 512 );
} }
@@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx; x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
@@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input )
sph_gost512( &ctx.gost, hash3, 64 ); sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 ); sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
@@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 ); (const BitSequence *) hash0, 512 );

View File

@@ -10,9 +10,9 @@
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -46,10 +46,10 @@ void init_x13_4way_ctx()
skein512_4way_init( &x13_4way_ctx.skein ); skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh ); jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak ); keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 ); luffa_2way_init( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite ); sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 ); simd_2way_init( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 ); init_echo( &x13_4way_ctx.echo, 512 );
hamsi512_4way_init( &x13_4way_ctx.hamsi ); hamsi512_4way_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue ); sph_fugue512_init( &x13_4way_ctx.fugue );
@@ -104,17 +104,13 @@ void x13_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,17 +138,13 @@ void x13_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -168,10 +160,10 @@ void x13_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial // 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -19,9 +19,9 @@
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sm3-hash-4way.h" #include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
@@ -28,10 +28,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
sm3_4way_ctx_t sm3; sm3_4way_ctx_t sm3;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
@@ -49,10 +49,10 @@ void init_x13sm3_4way_ctx()
skein512_4way_init( &x13sm3_4way_ctx.skein ); skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh ); jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak ); keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 ); luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite ); sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 ); simd_2way_init( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_4way_init( &x13sm3_4way_ctx.sm3 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 );
hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); hamsi512_4way_init( &x13sm3_4way_ctx.hamsi );
@@ -111,17 +111,13 @@ void x13sm3_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa // Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// Cubehash // Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -149,17 +145,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// Simd // Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// Echo // Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -190,12 +182,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 ); sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash ); sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 32 bit // Hamsi parallel 4x32x2
hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// Fugue serial // Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -15,9 +15,9 @@
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/sm3/sph_sm3.h" #include "algo/sm3/sph_sm3.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h" #include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -9,8 +9,7 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h"
#include "algo/fugue//sph_fugue.h" #include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
//#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -18,7 +17,7 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
shabal512_4way_context shabal; shabal512_4way_context shabal;
hashState_echo echo; hashState_echo echo;
hashState_luffa luffa; luffa_2way_context luffa;
sph_fugue512_context fugue; sph_fugue512_context fugue;
sph_gost512_context gost; sph_gost512_context gost;
} poly_4way_ctx_holder; } poly_4way_ctx_holder;
@@ -27,12 +26,12 @@ poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx() void init_polytimos_4way_ctx()
{ {
skein512_4way_init( &poly_4way_ctx.skein ); skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal ); shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 ); init_echo( &poly_4way_ctx.echo, 512 );
init_luffa( &poly_4way_ctx.luffa, 512 ); luffa_2way_init( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue ); sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost ); sph_gost512_init( &poly_4way_ctx.gost );
} }
void polytimos_4way_hash( void *output, const void *input ) void polytimos_4way_hash( void *output, const void *input )
@@ -67,17 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -8,7 +8,7 @@
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h" #include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
@@ -29,10 +29,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -45,15 +45,14 @@ void init_x14_4way_ctx()
{ {
blake512_4way_init( &x14_4way_ctx.blake ); blake512_4way_init( &x14_4way_ctx.blake );
bmw512_4way_init( &x14_4way_ctx.bmw ); bmw512_4way_init( &x14_4way_ctx.bmw );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 ); init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein ); skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh ); jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak ); keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 ); luffa_2way_init( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite ); sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 ); simd_2way_init( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 ); init_echo( &x14_4way_ctx.echo, 512 );
hamsi512_4way_init( &x14_4way_ctx.hamsi ); hamsi512_4way_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue ); sph_fugue512_init( &x14_4way_ctx.fugue );
@@ -109,17 +108,13 @@ void x14_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -147,17 +142,13 @@ void x14_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -173,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial // 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -20,9 +20,9 @@
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h" #include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,14 +12,13 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
@@ -31,13 +30,12 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
// sph_hamsi512_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
sph_whirlpool_context whirlpool; sph_whirlpool_context whirlpool;
@@ -53,13 +51,12 @@ void init_x15_4way_ctx()
skein512_4way_init( &x15_4way_ctx.skein ); skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh ); jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak ); keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 ); luffa_2way_init( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite ); sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 ); simd_2way_init( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 ); init_echo( &x15_4way_ctx.echo, 512 );
hamsi512_4way_init( &x15_4way_ctx.hamsi ); hamsi512_4way_init( &x15_4way_ctx.hamsi );
// sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue ); sph_fugue512_init( &x15_4way_ctx.fugue );
shabal512_4way_init( &x15_4way_ctx.shabal ); shabal512_4way_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool ); sph_whirlpool_init( &x15_4way_ctx.whirlpool );
@@ -114,17 +111,13 @@ void x15_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -152,17 +145,13 @@ void x15_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,24 +167,11 @@ void x15_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
/*
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
*/
// 13 Fugue // 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -21,9 +21,9 @@
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -23,9 +23,9 @@
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
typedef struct { typedef struct {

View File

@@ -19,9 +19,9 @@
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -41,10 +41,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -68,6 +68,10 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64))); uint32_t vhash[24*4] __attribute__ ((aligned (64)));
// uint32_t inp0[24] __attribute__ ((aligned (64)));
// uint32_t inp1[24] __attribute__ ((aligned (64)));
// uint32_t inp2[24] __attribute__ ((aligned (64)));
// uint32_t inp3[24] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx; x16r_4way_ctx_holder ctx;
@@ -75,7 +79,6 @@ void x16r_4way_hash( void* output, const void* input )
void *in1 = (void*) hash1; void *in1 = (void*) hash1;
void *in2 = (void*) hash2; void *in2 = (void*) hash2;
void *in3 = (void*) hash3; void *in3 = (void*) hash3;
int size = 80; int size = 80;
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
@@ -111,7 +114,7 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, vhash, size ); blake512_4way( &ctx.blake, vhash, size );
} }
blake512_4way_close( &ctx.blake, vhash ); blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case BMW: case BMW:
bmw512_4way_init( &ctx.bmw ); bmw512_4way_init( &ctx.bmw );
@@ -123,7 +126,7 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, vhash, size ); bmw512_4way( &ctx.bmw, vhash, size );
} }
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case GROESTL: case GROESTL:
init_groestl( &ctx.groestl, 64 ); init_groestl( &ctx.groestl, 64 );
@@ -149,7 +152,7 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, vhash, size ); skein512_4way( &ctx.skein, vhash, size );
} }
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case JH: case JH:
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -161,7 +164,7 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, vhash, size ); jh512_4way( &ctx.jh, vhash, size );
} }
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case KECCAK: case KECCAK:
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
@@ -173,21 +176,17 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, vhash, size ); keccak512_4way( &ctx.keccak, vhash, size );
} }
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case LUFFA: case LUFFA:
init_luffa( &ctx.luffa, 512 ); mm256_interleave_2x128( vhash, in0, in1, size<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)in0, size ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
init_luffa( &ctx.luffa, 512 ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, in2, in3, size<<3 );
(const BitSequence*)in1, size ); luffa_2way_init( &ctx.luffa, 512 );
init_luffa( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)in2, size );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)in3, size );
break; break;
case CUBEHASH: case CUBEHASH:
cubehashReinit( &ctx.cube ); cubehashReinit( &ctx.cube );
@@ -218,18 +217,14 @@ void x16r_4way_hash( void* output, const void* input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
break; break;
case SIMD: case SIMD:
init_sd( &ctx.simd, 512 ); mm256_interleave_2x128( vhash, in0, in1, size<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, simd_2way_init( &ctx.simd, 512 );
(const BitSequence*)in0, size<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
init_sd( &ctx.simd, 512 ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, in2, in3, size<<3 );
(const BitSequence*)in1, size<<3 ); simd_2way_init( &ctx.simd, 512 );
init_sd( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)in2, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
break; break;
case ECHO: case ECHO:
init_echo( &ctx.echo, 512 ); init_echo( &ctx.echo, 512 );
@@ -246,11 +241,11 @@ void x16r_4way_hash( void* output, const void* input )
(const BitSequence*)in3, size<<3 ); (const BitSequence*)in3, size<<3 );
break; break;
case HAMSI: case HAMSI:
mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 ); mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case FUGUE: case FUGUE:
sph_fugue512_init( &ctx.fugue ); sph_fugue512_init( &ctx.fugue );
@@ -271,7 +266,7 @@ void x16r_4way_hash( void* output, const void* input )
shabal512_4way_init( &ctx.shabal ); shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash ); shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case WHIRLPOOL: case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool_init( &ctx.whirlpool );
@@ -292,9 +287,13 @@ void x16r_4way_hash( void* output, const void* input )
sha512_4way_init( &ctx.sha512 ); sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size ); sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash ); sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
} }
// in0 = (void*) hash0;
// in1 = (void*) hash1;
// in2 = (void*) hash2;
// in3 = (void*) hash3;
size = 64; size = 64;
} }
memcpy( output, hash0, 32 ); memcpy( output, hash0, 32 );
@@ -351,28 +350,28 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
x16r_4way_hash( hash, vdata ); x16r_4way_hash( hash, vdata );
pdata[19] = n; pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) ) if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{ {
found[0] = true; found[0] = true;
num_found++; num_found++;
nonces[0] = n; nonces[0] = n;
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
} }
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) ) if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{ {
found[1] = true; found[1] = true;
num_found++; num_found++;
nonces[1] = n+1; nonces[1] = n+1;
work_set_target_ratio( work, hash+8 ); work_set_target_ratio( work, hash+8 );
} }
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) ) if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{ {
found[2] = true; found[2] = true;
num_found++; num_found++;
nonces[2] = n+2; nonces[2] = n+2;
work_set_target_ratio( work, hash+16 ); work_set_target_ratio( work, hash+16 );
} }
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) ) if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{ {
found[3] = true; found[3] = true;
num_found++; num_found++;

View File

@@ -16,9 +16,9 @@
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -117,7 +117,7 @@ void x16r_hash( void* output, const void* input )
case GROESTL: case GROESTL:
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_init( &ctx.groestl ); sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size<<3 ); sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash); sph_groestl512_close(&ctx.groestl, hash);
#else #else
init_groestl( &ctx.groestl, 64 ); init_groestl( &ctx.groestl, 64 );

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -54,10 +54,10 @@ void init_x17_4way_ctx()
skein512_4way_init( &x17_4way_ctx.skein ); skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh ); jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak ); keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 ); luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite ); sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 ); simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 ); init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi ); hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue ); sph_fugue512_init( &x17_4way_ctx.fugue );
@@ -114,18 +114,14 @@ void x17_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa serial // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -153,17 +149,13 @@ void x17_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,11 +170,11 @@ void x17_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue // 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -21,9 +21,9 @@
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h" #include "algo/haval/sph-haval.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -13,9 +13,9 @@
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -56,10 +56,10 @@ void init_xevan_4way_ctx()
skein512_4way_init(&xevan_4way_ctx.skein); skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh); jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak); keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 ); luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite ); sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 ); simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 ); init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi ); hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue ); sph_fugue512_init( &xevan_4way_ctx.fugue );
@@ -127,20 +127,14 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
(const BitSequence*)hash0, dataLen ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, dataLen ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen ); dataLen );
@@ -169,17 +163,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 ); (const BitSequence *) hash0, dataLen<<3 );
@@ -192,12 +182,11 @@ void xevan_4way_hash( void *output, const void *input )
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 ); (const BitSequence *) hash3, dataLen<<3 );
// Parallel
// Parallel 32 bit mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );
@@ -278,18 +267,13 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
(const BitSequence*)hash0, dataLen ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, dataLen ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen ); dataLen );
@@ -318,17 +302,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 ); (const BitSequence *) hash0, dataLen<<3 );
@@ -342,10 +322,10 @@ void xevan_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 ); (const BitSequence *) hash3, dataLen<<3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -11,14 +11,14 @@
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h" #include "algo/haval/sph-haval.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include <openssl/sha.h> #include <openssl/sha.h>
#ifdef NO_AES_NI #ifdef NO_AES_NI

View File

@@ -424,12 +424,17 @@ int64_t yescryptr16_get_max64()
return 0xfffLL; return 0xfffLL;
} }
bool register_yescrypt_algo( algo_gate_t* gate ) void yescrypt_gate_base(algo_gate_t *gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt; gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash; gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target; gate->set_target = (void*)&scrypt_set_target;
}
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64; gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = true; client_key_hack = true;
YESCRYPT_N = 2048; YESCRYPT_N = 2048;
@@ -440,10 +445,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; yescrypt_gate_base( gate );
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescrypt_get_max64; gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false; client_key_hack = false;
YESCRYPT_N = 2048; YESCRYPT_N = 2048;
@@ -454,10 +456,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; yescrypt_gate_base( gate );
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescryptr16_get_max64; gate->get_max64 = (void*)&yescryptr16_get_max64;
client_key_hack = false; client_key_hack = false;
YESCRYPT_N = 4096; YESCRYPT_N = 4096;

1314
avxdefs.h

File diff suppressed because it is too large Load Diff

View File

@@ -3,16 +3,6 @@
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
strip -s cpuminer
mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe

View File

@@ -18,8 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4 make -j 4

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.0.1' PACKAGE_VERSION='3.8.1.1'
PACKAGE_STRING='cpuminer-opt 3.8.0.1' PACKAGE_STRING='cpuminer-opt 3.8.1.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.0.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.8.1.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.0.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.8.1.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.8.0.1 cpuminer-opt configure 3.8.1.1
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.0.1, which was It was created by cpuminer-opt $as_me 3.8.1.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.8.0.1' VERSION='3.8.1.1'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.8.0.1, which was This file was extended by cpuminer-opt $as_me 3.8.1.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.8.0.1 cpuminer-opt config.status 3.8.1.1
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.0.1]) AC_INIT([cpuminer-opt], [3.8.1.1])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -3238,10 +3238,10 @@ int main(int argc, char *argv[])
} }
} }
//#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
// if (use_syslog) if (use_syslog)
// openlog("cpuminer", LOG_PID, LOG_USER); openlog("cpuminer", LOG_PID, LOG_USER);
//#endif #endif
work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart)); work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart));
if (!work_restart) if (!work_restart)

10
miner.h
View File

@@ -80,10 +80,10 @@ void *alloca (size_t);
# endif # endif
//#endif //#endif
//#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
//#include <syslog.h> #include <syslog.h>
//#define LOG_BLUE 0x10 /* unique value */ #define LOG_BLUE 0x10 /* unique value */
//#else #else
enum { enum {
LOG_ERR, LOG_ERR,
LOG_WARNING, LOG_WARNING,
@@ -93,7 +93,7 @@ enum {
/* custom notices */ /* custom notices */
LOG_BLUE = 0x10, LOG_BLUE = 0x10,
}; };
//#endif #endif
static inline bool is_windows(void) static inline bool is_windows(void)
{ {