Jay D Dee
2018-02-07 16:38:45 -05:00
parent 54b8fd7362
commit a28daca3ce
83 changed files with 5153 additions and 3924 deletions

View File

@@ -100,7 +100,8 @@ cpuminer_SOURCES = \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
@@ -127,7 +128,11 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \
algo/quark/anime.c \
algo/quark/anime-4way.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
algo/qubit/deep-gate.c \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
algo/scrypt.c \
@@ -143,8 +148,9 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
algo/simd/sse2/vector.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \

View File

@@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions.
Supported Algorithms
--------------------
anime Animecoin
argon2
axiom Shabal-256 MemoHash
bastion
@@ -78,6 +79,7 @@ Supported Algorithms
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
@@ -136,10 +138,13 @@ output from the miner showing the startup and any errors.
Donations
---------
I do not do this for money but I have a donation address if users
are so inclined.
cpuminer-opt has no fees of any kind but donations are accepted.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining!

View File

@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

View File

@@ -98,8 +98,8 @@ Start mining.
Windows
The following is how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.
Building on Windows prerequisites:
@@ -131,7 +131,7 @@ or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
Run build.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -159,6 +159,16 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.1
Fixes x16r on CPUs with only SSE2.
More optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt; no new optimizations.
v3.8.0.1
Fixed x16r AVX2 low hash rate.
v3.8.0
4way no longer a separate feature, included in AVX2.

View File

@@ -553,22 +553,22 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +615,22 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
M0 = mm_bswap_32( * buf ); \
M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_bswap_32( *(buf+10) ); \
MB = mm_bswap_32( *(buf+11) ); \
MC = mm_bswap_32( *(buf+12) ); \
MD = mm_bswap_32( *(buf+13) ); \
ME = mm_bswap_32( *(buf+14) ); \
MF = mm_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -727,22 +727,22 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_byteswap_32( * buf ); \
M1 = mm256_byteswap_32( *(buf+1) ); \
M2 = mm256_byteswap_32( *(buf+2) ); \
M3 = mm256_byteswap_32( *(buf+3) ); \
M4 = mm256_byteswap_32( *(buf+4) ); \
M5 = mm256_byteswap_32( *(buf+5) ); \
M6 = mm256_byteswap_32( *(buf+6) ); \
M7 = mm256_byteswap_32( *(buf+7) ); \
M8 = mm256_byteswap_32( *(buf+8) ); \
M9 = mm256_byteswap_32( *(buf+9) ); \
MA = mm256_byteswap_32( *(buf+10) ); \
MB = mm256_byteswap_32( *(buf+11) ); \
MC = mm256_byteswap_32( *(buf+12) ); \
MD = mm256_byteswap_32( *(buf+13) ); \
ME = mm256_byteswap_32( *(buf+14) ); \
MF = mm256_byteswap_32( *(buf+15) ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -848,22 +848,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +913,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
out[k] = mm_bswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_byteswap_32( sc->H[k] );
out[k] = mm256_bswap_32( sc->H[k] );
}
// Blake-512 4 way
@@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc,
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
*(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
*(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_64( sc->H[k] );
out[k] = mm256_bswap_64( sc->H[k] );
}
#endif
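
The diff above is a rename of mm_byteswap_32 / mm256_byteswap_32 to mm_bswap_32 / mm256_bswap_32 throughout the Blake message-load macros. For reference, a minimal sketch of what such helpers can look like, assuming SSSE3/AVX2 byte shuffles are available; the bodies are an assumed idiom, not copied from this commit:

static inline __m128i mm_bswap_32( __m128i x )
{
    // reverse the four bytes inside each 32-bit lane
    return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                               4, 5, 6, 7,  0, 1, 2, 3 ) );
}

static inline __m256i mm256_bswap_32( __m256i x )
{
    // same pattern, repeated for each 128-bit lane of the 256-bit register
    return _mm256_shuffle_epi8( x, _mm256_set_epi8(
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 ) );
}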

View File

@@ -984,7 +984,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = mm_zero;
buf[ (buf_size - 4) >> 2 ] = m128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )

View File

@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
#endif
} // transform
// Ccubehash context initializing is very expensive.
// Cubehash context initializing is very expensive.
// Cache the initial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
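
The caching pattern the comment describes, as a minimal hypothetical sketch (the helper name cubehash_reinit is illustrative, not from this commit, and assumes <string.h> is included):

static inline void cubehash_reinit( cubehashParam *sp )
{
    // copying the cached, fully initialized state replaces re-running
    // the expensive initialization transform on every new hash
    memcpy( sp, &cube_ctx_cache, sizeof *sp );
}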

File diff suppressed because it is too large

View File

@@ -48,20 +48,20 @@ extern "C"{
#define SPH_SIZE_hamsi512 512
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
__m128i h[16];
__m128i partial[2];
__m256i h[8];
__m256i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init(void *cc);
void hamsi512_4way(void *cc, const void *data, size_t len);
void hamsi512_4way_close(void *cc, void *dst);
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
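/* Hypothetical usage sketch, not from this commit; vdata and hash are
   illustrative names for 4-way 64-bit interleaved input/output buffers:

       hamsi512_4way_context ctx;
       hamsi512_4way_init( &ctx );
       hamsi512_4way( &ctx, vdata, 80 );    // 80 bytes per lane
       hamsi512_4way_close( &ctx, hash );   // 64-byte digest per lane
*/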
#ifdef __cplusplus
}

View File

@@ -1,482 +0,0 @@
/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
/*
* Helper code for Hamsi (input block expansion). This code is
* automatically generated and includes precomputed tables for
* expansion code which handles 2 to 8 bits at a time.
*
* This file is included from hamsi.c, and is not meant to be compiled
* independently.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef __cplusplus
extern "C"{
#endif
/* Note: this table lists bits within each byte from least
significant to most significant. */
static const sph_u32 T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
};
#define U_BIG( n ) \
do { \
__m128i db = buf[n]; \
for ( int u = 0; u < 32; u++ ) \
{ \
__m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
db = _mm_srli_epi32( db, 1 ); \
} \
} while (0);
#define INPUT_BIG \
do { \
const sph_u32 *tp = &T512[0][0]; \
m0 = mm_zero; \
m1 = mm_zero; \
m2 = mm_zero; \
m3 = mm_zero; \
m4 = mm_zero; \
m5 = mm_zero; \
m6 = mm_zero; \
m7 = mm_zero; \
m8 = mm_zero; \
m9 = mm_zero; \
mA = mm_zero; \
mB = mm_zero; \
mC = mm_zero; \
mD = mm_zero; \
mE = mm_zero; \
mF = mm_zero; \
U_BIG( 0 ); \
U_BIG( 1 ); \
} while (0)
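/* Illustrative scalar equivalent of INPUT_BIG above (not part of this
   commit): per lane, each of the 64 input bits selects one 16-word row of
   T512, and the selected rows are XORed together into m0..mF (held here in
   an array). */
static void input_big_ref( sph_u32 m[16], const sph_u32 w[2] )
{
    const sph_u32 *tp = &T512[0][0];
    for ( int k = 0; k < 16; k++ ) m[k] = 0;
    for ( int n = 0; n < 2; n++ )
    {
        sph_u32 db = w[n];
        for ( int u = 0; u < 32; u++ )
        {
            sph_u32 dm = (sph_u32)0 - ( db & 1 );  /* all-ones if bit set */
            for ( int k = 0; k < 16; k++ )
                m[k] ^= dm & *tp++;
            db >>= 1;
        }
    }
}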
#ifdef __cplusplus
}
#endif

algo/hamsi/sph_hamsi.c.test (940 lines) Normal file
View File

@@ -0,0 +1,940 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_hamsi.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_SMALL_FOOTPRINT_HAMSI 1
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
#if !defined SPH_HAMSI_EXPAND_SMALL
#if SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_HAMSI_EXPAND_SMALL 4
#else
#define SPH_HAMSI_EXPAND_SMALL 8
#endif
#endif
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 8
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#include "sph_hamsi_helper.c"
static const sph_u32 IV224[] = {
SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
/*
* This version is the one used in the Hamsi submission package for
* round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
* shall soon be corrected in the official Hamsi specification.
*
static const sph_u32 IV224[] = {
SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
*/
static const sph_u32 IV256[] = {
SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
SPH_C32(0x656d656e), SPH_C32(0x7420456c)
};
static const sph_u32 IV384[] = {
SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
SPH_C32(0x2c204b61)
};
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
#define DECL_STATE_SMALL \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
#define READ_STATE_SMALL(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_SMALL(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define s0 m0
#define s1 m1
#define s2 c0
#define s3 c1
#define s4 c2
#define s5 c3
#define s6 m2
#define s7 m3
#define s8 m4
#define s9 m5
#define sA c4
#define sB c5
#define sC c6
#define sD c7
#define sE m6
#define sF m7
#define SBOX(a, b, c, d) do { \
sph_u32 t; \
t = (a); \
(a) &= (c); \
(a) ^= (d); \
(c) ^= (b); \
(c) ^= (a); \
(d) |= t; \
(d) ^= (b); \
t ^= (c); \
(b) = (d); \
(d) |= t; \
(d) ^= (a); \
(a) &= (b); \
t ^= (a); \
(b) ^= (d); \
(b) ^= t; \
(a) = (c); \
(c) = (b); \
(b) = (d); \
(d) = SPH_T32(~t); \
} while (0)
#define L(a, b, c, d) do { \
(a) = SPH_ROTL32(a, 13); \
(c) = SPH_ROTL32(c, 3); \
(b) ^= (a) ^ (c); \
(d) ^= (c) ^ SPH_T32((a) << 3); \
(b) = SPH_ROTL32(b, 1); \
(d) = SPH_ROTL32(d, 7); \
(a) ^= (b) ^ (d); \
(c) ^= (d) ^ SPH_T32((b) << 7); \
(a) = SPH_ROTL32(a, 5); \
(c) = SPH_ROTL32(c, 22); \
} while (0)
#define ROUND_SMALL(rc, alpha) do { \
s0 ^= alpha[0x00]; \
s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
s2 ^= alpha[0x02]; \
s3 ^= alpha[0x03]; \
s4 ^= alpha[0x08]; \
s5 ^= alpha[0x09]; \
s6 ^= alpha[0x0A]; \
s7 ^= alpha[0x0B]; \
s8 ^= alpha[0x10]; \
s9 ^= alpha[0x11]; \
sA ^= alpha[0x12]; \
sB ^= alpha[0x13]; \
sC ^= alpha[0x18]; \
sD ^= alpha[0x19]; \
sE ^= alpha[0x1A]; \
sF ^= alpha[0x1B]; \
SBOX(s0, s4, s8, sC); \
SBOX(s1, s5, s9, sD); \
SBOX(s2, s6, sA, sE); \
SBOX(s3, s7, sB, sF); \
L(s0, s5, sA, sF); \
L(s1, s6, sB, sC); \
L(s2, s7, s8, sD); \
L(s3, s4, s9, sE); \
} while (0)
#define P_SMALL do { \
ROUND_SMALL(0, alpha_n); \
ROUND_SMALL(1, alpha_n); \
ROUND_SMALL(2, alpha_n); \
} while (0)
#define PF_SMALL do { \
ROUND_SMALL(0, alpha_f); \
ROUND_SMALL(1, alpha_f); \
ROUND_SMALL(2, alpha_f); \
ROUND_SMALL(3, alpha_f); \
ROUND_SMALL(4, alpha_f); \
ROUND_SMALL(5, alpha_f); \
} while (0)
#define T_SMALL do { \
/* order is important */ \
c7 = (sc->h[7] ^= sB); \
c6 = (sc->h[6] ^= sA); \
c5 = (sc->h[5] ^= s9); \
c4 = (sc->h[4] ^= s8); \
c3 = (sc->h[3] ^= s3); \
c2 = (sc->h[2] ^= s2); \
c1 = (sc->h[1] ^= s1); \
c0 = (sc->h[0] ^= s0); \
} while (0)
static void
hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_SMALL
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 5;
#else
tmp = SPH_T32((sph_u32)num << 5);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 14);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_SMALL(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_SMALL;
P_SMALL;
T_SMALL;
buf += 4;
}
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_SMALL
READ_STATE_SMALL(sc);
INPUT_SMALL;
PF_SMALL;
T_SMALL;
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
{
if (sc->partial_len != 0) {
size_t mlen;
mlen = 4 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_small(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_small(sc, data, (len >> 2));
data = (const unsigned char *)data + (len & ~(size_t)3);
len &= (size_t)3;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_small_close(sph_hamsi_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[12];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
memcpy(pad, sc->partial, ptr);
#if SPH_64
sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad + 4, sc->count_high);
sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
pad[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 4)
pad[ptr ++] = 0;
hamsi_small(sc, pad, 2);
hamsi_small_final(sc, pad + 8);
out = dst;
for (u = 0; u < out_size_w32; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
#define DECL_STATE_BIG \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
#define READ_STATE_BIG(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c8 = sc->h[0x8]; \
c9 = sc->h[0x9]; \
cA = sc->h[0xA]; \
cB = sc->h[0xB]; \
cC = sc->h[0xC]; \
cD = sc->h[0xD]; \
cE = sc->h[0xE]; \
cF = sc->h[0xF]; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0x8] = c8; \
sc->h[0x9] = c9; \
sc->h[0xA] = cA; \
sc->h[0xB] = cB; \
sc->h[0xC] = cC; \
sc->h[0xD] = cD; \
sc->h[0xE] = cE; \
sc->h[0xF] = cF; \
} while (0)
#define s00 m0
#define s01 m1
#define s02 c0
#define s03 c1
#define s04 m2
#define s05 m3
#define s06 c2
#define s07 c3
#define s08 c4
#define s09 c5
#define s0A m4
#define s0B m5
#define s0C c6
#define s0D c7
#define s0E m6
#define s0F m7
#define s10 m8
#define s11 m9
#define s12 c8
#define s13 c9
#define s14 mA
#define s15 mB
#define s16 cA
#define s17 cB
#define s18 cC
#define s19 cD
#define s1A mC
#define s1B mD
#define s1C cE
#define s1D cF
#define s1E mE
#define s1F mF
#define ROUND_BIG(rc, alpha) do { \
s00 ^= alpha[0x00]; \
s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
s02 ^= alpha[0x02]; \
s03 ^= alpha[0x03]; \
s04 ^= alpha[0x04]; \
s05 ^= alpha[0x05]; \
s06 ^= alpha[0x06]; \
s07 ^= alpha[0x07]; \
s08 ^= alpha[0x08]; \
s09 ^= alpha[0x09]; \
s0A ^= alpha[0x0A]; \
s0B ^= alpha[0x0B]; \
s0C ^= alpha[0x0C]; \
s0D ^= alpha[0x0D]; \
s0E ^= alpha[0x0E]; \
s0F ^= alpha[0x0F]; \
s10 ^= alpha[0x10]; \
s11 ^= alpha[0x11]; \
s12 ^= alpha[0x12]; \
s13 ^= alpha[0x13]; \
s14 ^= alpha[0x14]; \
s15 ^= alpha[0x15]; \
s16 ^= alpha[0x16]; \
s17 ^= alpha[0x17]; \
s18 ^= alpha[0x18]; \
s19 ^= alpha[0x19]; \
s1A ^= alpha[0x1A]; \
s1B ^= alpha[0x1B]; \
s1C ^= alpha[0x1C]; \
s1D ^= alpha[0x1D]; \
s1E ^= alpha[0x1E]; \
s1F ^= alpha[0x1F]; \
SBOX(s00, s08, s10, s18); \
SBOX(s01, s09, s11, s19); \
SBOX(s02, s0A, s12, s1A); \
SBOX(s03, s0B, s13, s1B); \
SBOX(s04, s0C, s14, s1C); \
SBOX(s05, s0D, s15, s1D); \
SBOX(s06, s0E, s16, s1E); \
SBOX(s07, s0F, s17, s1F); \
L(s00, s09, s12, s1B); \
L(s01, s0A, s13, s1C); \
L(s02, s0B, s14, s1D); \
L(s03, s0C, s15, s1E); \
L(s04, s0D, s16, s1F); \
L(s05, s0E, s17, s18); \
L(s06, s0F, s10, s19); \
L(s07, s08, s11, s1A); \
/*if (rc == 0 ) { \
printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
}*/ \
L(s00, s02, s05, s07); \
L(s10, s13, s15, s16); \
/*if (rc == 0 ) { \
printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
}*/ \
L(s09, s0B, s0C, s0E); \
L(s19, s1A, s1C, s1F); \
} while (0)
#if SPH_SMALL_FOOTPRINT_HAMSI
#define P_BIG do { \
unsigned r; \
for (r = 0; r < 6; r ++) \
ROUND_BIG(r, alpha_n); \
} while (0)
#define PF_BIG do { \
unsigned r; \
for (r = 0; r < 12; r ++) \
ROUND_BIG(r, alpha_f); \
} while (0)
#else
#define P_BIG do { \
ROUND_BIG(0, alpha_n); \
/*printf("S R0 s00 %08lx s01 %08lx s02 %08lx s03 %08lx\n",s00,s01,s02,s03); \
printf("S R0 s04 %08lx s05 %08lx s06 %08lx s07 %08lx\n",s04,s05,s06,s07); \
printf("S R0 s08 %08lx s09 %08lx s0A %08lx s0B %08lx\n",s08,s09,s0A,s0B); \
printf("S R0 s0C %08lx s0D %08lx s0E %08lx s0F %08lx\n",s0C,s0D,s0E,s0F); \
printf("S R0 s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
printf("S R0 s14 %08lx s15 %08lx s16 %08lx s17 %08lx\n",s14,s15,s16,s17); \
printf("S R0 s18 %08lx s19 %08lx s1A %08lx s1B %08lx\n",s18,s19,s1A,s1B); \
printf("S R0 s1C %08lx s1D %08lx s1E %08lx s1F %08lx\n",s1C,s1D,s1E,s1F); \
*/\
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)
#define PF_BIG do { \
ROUND_BIG(0, alpha_f); \
ROUND_BIG(1, alpha_f); \
ROUND_BIG(2, alpha_f); \
ROUND_BIG(3, alpha_f); \
ROUND_BIG(4, alpha_f); \
ROUND_BIG(5, alpha_f); \
ROUND_BIG(6, alpha_f); \
ROUND_BIG(7, alpha_f); \
ROUND_BIG(8, alpha_f); \
ROUND_BIG(9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)
#endif
#define T_BIG do { \
/* order is important */ \
cF = (sc->h[0xF] ^= s17); \
cE = (sc->h[0xE] ^= s16); \
cD = (sc->h[0xD] ^= s15); \
cC = (sc->h[0xC] ^= s14); \
cB = (sc->h[0xB] ^= s13); \
cA = (sc->h[0xA] ^= s12); \
c9 = (sc->h[0x9] ^= s11); \
c8 = (sc->h[0x8] ^= s10); \
c7 = (sc->h[0x7] ^= s07); \
c6 = (sc->h[0x6] ^= s06); \
c5 = (sc->h[0x5] ^= s05); \
c4 = (sc->h[0x4] ^= s04); \
c3 = (sc->h[0x3] ^= s03); \
c2 = (sc->h[0x2] ^= s02); \
c1 = (sc->h[0x1] ^= s01); \
c0 = (sc->h[0x0] ^= s00); \
} while (0)
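/* The descending order matters: the sNN names alias the cNN variables
   (e.g. s17 is cB, s13 is c9), so each cNN must be consumed through its
   alias before the feedforward XOR overwrites it. */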
static void
hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_BIG
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 6;
#else
tmp = SPH_T32((sph_u32)num << 6);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 13);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_BIG(sc);
/*
uint32_t* b = (uint32_t*)buf;
//printf("S s64: %016llx\n",*ss);
//printf("S buf: %08lx %08lx\n",b[0], b[1]);
int n1 = 1;
int n2 = 1;
*/
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
INPUT_BIG;
/*if ( n1 )
{
n1 = 0;
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m0,m1,m2,m3 );
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m4,m5,m6,m7);
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m8,m9,mA,mB );
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",mC,mD,mE,mF);
}
*/
P_BIG;
/*if ( n2 )
{
n2 = 0;
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s00,s01,s02,s03 );
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s04,s05,s07,s07);
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s08,s09,s0A,s0B );
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s0C,s0D,s0E,s0F);
}
*/
T_BIG;
buf += 8;
}
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
DECL_STATE_BIG
READ_STATE_BIG(sc);
INPUT_BIG;
PF_BIG;
T_BIG;
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
{
uint64_t* d = (uint64_t*)data;
uint64_t* h = (uint64_t*)sc->h;
/*
printf("S core1 len = %d\n",len);
printf("S data: %016llx %016llx %016llx %016llx\n",d[0],d[1],d[2],d[3]);
printf("S data: %016llx %016llx %016llx %016llx\n",d[4],d[5],d[6],d[7]);
printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
*/
if (sc->partial_len != 0) {
//printf("WARNING partial_len != 0\n");
size_t mlen;
mlen = 8 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_big(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_big(sc, data, (len >> 3));
/*
printf("S core2\n");
printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
*/
data = (const unsigned char *)data + (len & ~(size_t)7);
len &= (size_t)7;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_big_close(sph_hamsi_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[8];
size_t ptr, u;
unsigned z;
unsigned char *out;
//uint64_t* h = (uint64_t*)sc->h;
ptr = sc->partial_len;
#if SPH_64
sph_enc64be(pad, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad, sc->count_high);
sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 8)
sc->partial[ptr ++] = 0;
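/* sc->partial now holds the 0x80 end marker plus zero fill and is
   processed with the normal rounds below; pad[] (the big endian bit
   count) is then absorbed with the final rounds. */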
//printf("S close1\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
hamsi_big(sc, sc->partial, 1);
//printf("S close2\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
hamsi_big_final(sc, pad);
//printf("S close3\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
out = dst;
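/* Hamsi-384 (12 output words): state words 2, 7, 11 and 14 are dropped. */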
if (out_size_w32 == 12) {
sph_enc32be(out + 0, sc->h[ 0]);
sph_enc32be(out + 4, sc->h[ 1]);
sph_enc32be(out + 8, sc->h[ 3]);
sph_enc32be(out + 12, sc->h[ 4]);
sph_enc32be(out + 16, sc->h[ 5]);
sph_enc32be(out + 20, sc->h[ 6]);
sph_enc32be(out + 24, sc->h[ 8]);
sph_enc32be(out + 28, sc->h[ 9]);
sph_enc32be(out + 32, sc->h[10]);
sph_enc32be(out + 36, sc->h[12]);
sph_enc32be(out + 40, sc->h[13]);
sph_enc32be(out + 44, sc->h[15]);
} else {
for (u = 0; u < 16; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
}
/* see sph_hamsi.h */
void
sph_hamsi224_init(void *cc)
{
hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi224_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi256_init(void *cc)
{
hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi256_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi384_init(void *cc)
{
hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi384_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi512_init(void *cc)
{
hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi512_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 16);
// hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 16);
// hamsi_big_init(cc, IV512);
}
#ifdef __cplusplus
}
#endif


@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = mm_one_32;
sc->buf[ current>>2 ] = m128_one_32;
current += 4;
RSTATE;
if ( current > 116UL )


@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI


@@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )


@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
for ( int round = 0; round < 3; round++ )
{
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );


@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->w[ 1] = m256_neg1;
kc->w[ 2] = m256_neg1;
kc->w[ 8] = m256_neg1;
kc->w[12] = m256_neg1;
kc->w[17] = m256_neg1;
kc->w[20] = m256_neg1;
kc->ptr = 0;
kc->lim = 200 - (out_size >> 2);
}


@@ -0,0 +1,568 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "avxdefs.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);
#define MULT2(a0,a1) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
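// MULT2 is the multiply-by-2 step of Luffa's message injection, applied
// to the 256 bit value held in the (a0,a1) lane pair; the masked shuffle
// forms the feedback word. (Interpretation per the Luffa spec.)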
// Pointer arithmetic below is confirmed correct, though array indexing
// would be clearer.
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = _mm256_load_si256(&t);
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
__m256i CNS[32];
/***************************************************/
/* Round function */
/* state: hash context */
static void rnd512_2way( luffa_2way_context *state, __m256i msg1, __m256i msg0 )
{
__m256i t[2];
__m256i *chainv = state->chainv;
__m256i tmp[2];
__m256i x[8];
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
MULT2( t[0], t[1] );
msg0 = _mm256_shuffle_epi32( msg0, 27 );
msg1 = _mm256_shuffle_epi32( msg1, 27 );
chainv[0] = _mm256_xor_si256( chainv[0], t[0] );
chainv[1] = _mm256_xor_si256( chainv[1], t[1] );
chainv[2] = _mm256_xor_si256( chainv[2], t[0] );
chainv[3] = _mm256_xor_si256( chainv[3], t[1] );
chainv[4] = _mm256_xor_si256( chainv[4], t[0] );
chainv[5] = _mm256_xor_si256( chainv[5], t[1] );
chainv[6] = _mm256_xor_si256( chainv[6], t[0] );
chainv[7] = _mm256_xor_si256( chainv[7], t[1] );
chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
t[0] = chainv[0];
t[1] = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
t[0] = chainv[8];
t[1] = chainv[9];
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) );
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
_mm256_srli_epi32( chainv[5], 30 ) );
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
_mm256_srli_epi32( chainv[7], 29 ) );
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
_mm256_srli_epi32( chainv[9], 28 ) );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS[10], &tmp[0] );
STEP_PART( &x[0], &CNS[12], &tmp[0] );
STEP_PART( &x[0], &CNS[14], &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31],
tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
static void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8*2] __attribute((aligned(64)));  // t[0] and t[1] stores below fill 16 words
__m256i* chainv = state->chainv;
__m256i t[2];
/*---- blank round with m=0 ----*/
rnd512_2way( state, m256_zero, m256_zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, m256_zero, m256_zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
int i;
state->hashbitlen = hashbitlen;
for ( i=0; i<32; i++ ) CNS[i] =
_mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );
for ( i=0; i<10; i++ ) state->chainv[i] =
_mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ],
IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
return 0;
}
// Do not call luffa_2way_update_close after having called luffa_2way_update.
// Once luffa_2way_update has been called, only call luffa_2way_update or
// luffa_2way_close.
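// Typical call sequences (illustrative, using this file's API):
//   one shot:    luffa_2way_init( &st, 512 );
//                luffa_2way_update_close( &st, out, data, 80 );
//   incremental: luffa_2way_init( &st, 512 );
//                luffa_2way_update( &st, data, 80 );
//                luffa_2way_close( &st, out );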
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buffer = (__m256i*)state->buffer;
int i;
int blocks = (int)len / 32;
state-> rembytes = (int)len % 32;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
rnd512_2way( state, mm256_bswap_32( vdata[1] ) ,
mm256_bswap_32( vdata[0] ) );
}
// A 16 byte (per lane) partial block remains for 80 byte inputs.
// Store it in the buffer; it is transformed in close so the midstate
// optimization still works.
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
}
return 0;
}
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
__m256i *buffer = (__m256i*)state->buffer;
// transform pad block
if ( state->rembytes )
// not empty, data is in buffer
rnd512_2way( state, buffer[1], buffer[0] );
else
// empty pad block, constant data
rnd512_2way( state, m256_zero,
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
finalization512_2way( state, (uint32*)hashval );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( hashval+128 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{
// Optimized for input lengths that are multiples of 16 bytes; good for
// 64 and 80 byte inputs.
__m256i *vdata = (__m256i*)data;
int i;
int blocks = (int)( inlen / 32 );
state->rembytes = inlen % 32;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
rnd512_2way( state, mm256_bswap_32( vdata[1] ),
mm256_bswap_32( vdata[0] ) );
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512_2way( state,
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm256_bswap_32( vdata[0] ) );
else
// empty pad block
rnd512_2way( state, m256_zero,
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+128 ) );
return 0;
}
#endif


@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512
/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
* of a message block*/
/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5
/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
typedef struct {
uint32 buffer[8*2] __attribute((aligned(64)));
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context;
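// Each __m256i holds the corresponding 128 bit word of two independent
// Luffa states, one per 128 bit lane (2 way interleaved).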
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len );
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif


@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_32( cast_m128i( data ) ) );
mm_bswap_32( cast_m128i( data ) ) );
}
else
{
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
}
#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
}
#endif


@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );


@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );

algo/qubit/deep-2way.c (new file, 130 lines)

@@ -0,0 +1,130 @@
#include "deep-gate.h"
#if defined(DEEP_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_echo echo;
} deep_2way_ctx_holder;
deep_2way_ctx_holder deep_2way_ctx;
void init_deep_2way_ctx()
{
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
cubehashInit(&deep_2way_ctx.cube,512,16,32);
sph_shavite512_init(&deep_2way_ctx.shavite);
init_echo(&deep_2way_ctx.echo, 512);
};
void deep_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
deep_2way_ctx_holder ctx;
memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
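// The Luffa midstate for the first 64 bytes was computed once in
// scanhash_deep_2way; only the last 16 bytes (with the nonce) are
// hashed per pass.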
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
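// Pick the widest mask consistent with the target so the cheap
// ( hash[7] & mask ) test below rejects most candidates before fulltest.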
// big endian encode all 20 uint32_t of the block header
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
deep_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );  // second lane's hash
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/qubit/deep-gate.c (new file, 17 lines)

@@ -0,0 +1,17 @@
#include "deep-gate.h"
bool register_deep_algo( algo_gate_t* gate )
{
#if defined (DEEP_2WAY)
init_deep_2way_ctx();
gate->scanhash = (void*)&scanhash_deep_2way;
gate->hash = (void*)&deep_2way_hash;
#else
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
return true;
};

algo/qubit/deep-gate.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#ifndef DEEP_GATE_H__
#define DEEP_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define DEEP_2WAY
#endif
bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_2way_ctx();
#endif
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_ctx();
#endif


@@ -1,9 +1,9 @@
#include "algo-gate-api.h"
#include "deep-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_deep_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
return true;
};

algo/qubit/qubit-2way.c (new file, 138 lines)

@@ -0,0 +1,138 @@
#include "qubit-gate.h"
#if defined(QUBIT_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
} qubit_2way_ctx_holder;
qubit_2way_ctx_holder qubit_2way_ctx;
void init_qubit_2way_ctx()
{
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
cubehashInit(&qubit_2way_ctx.cube,512,16,32);
sph_shavite512_init(&qubit_2way_ctx.shavite);
simd_2way_init( &qubit_2way_ctx.simd, 512 );
init_echo(&qubit_2way_ctx.echo, 512);
};
void qubit_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
qubit_2way_ctx_holder ctx;
memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
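// Luffa midstate for the first 64 bytes comes from scanhash_qubit_2way;
// only the last 16 bytes (with the nonce) are hashed here.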
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode all 20 uint32_t of the block header
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
qubit_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/qubit/qubit-gate.c (new file, 17 lines)

@@ -0,0 +1,17 @@
#include "qubit-gate.h"
bool register_qubit_algo( algo_gate_t* gate )
{
#if defined (QUBIT_2WAY)
init_qubit_2way_ctx();
gate->scanhash = (void*)&scanhash_qubit_2way;
gate->hash = (void*)&qubit_2way_hash;
#else
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
return true;
};

algo/qubit/qubit-gate.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#ifndef QUBIT_GATE_H__
#define QUBIT_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY
#endif
bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_2way_ctx();
#endif
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_ctx();
#endif


@@ -1,11 +1,11 @@
#include "algo-gate-api.h"
#include "qubit-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
update_luffa( &qubit_luffa_mid, input, 64 );
}
void qubithash(void *output, const void *input)
void qubit_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute((aligned(64)));
#define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
{
pdata[19] = ++n;
be32enc(&endiandata[19], n);
qubithash(hash64, endiandata);
qubit_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
return 0;
}
bool register_qubit_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubithash;
return true;
};


@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;


@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif


@@ -129,7 +129,7 @@ sha512_4way_round( __m256i *in, __m256i r[8] )
__m256i W[80];
for ( i = 0; i < 16; i++ )
W[i] = mm256_byteswap_64( in[i] );
W[i] = mm256_bswap_64( in[i] );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -224,13 +224,13 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
}
#endif


@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Partially rotate the elements of two 128 bit vectors, treated as one
// 256 bit vector, and return the rotated high 128 bits: hi's elements
// shift down one lane and the vacated top lane is filled from lo.
// Similar to mm_rotr256_1x32 but faster because the rotation of lo is
// never completed.
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
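// e.g. for n = 1 the returned 32 bit lanes, low to high, are
// { hi[1], hi[2], hi[3], lo[0] }.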
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
@@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
@@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );

algo/simd/simd-hash-2way.c (new file, 853 lines)
View File

@@ -0,0 +1,853 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "simd-hash-2way.h"
#if defined (__AVX2__)
// imported from simd_iv.h
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
/* Twiddle tables */
static const m256_v16 FFT64_Twiddle[] =
{
{{ 1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128 }},
{{ 1, 60, 2, 120, 4, -17, 8, -34,
1, 60, 2, 120, 4, -17, 8, -34 }},
{{ 1, 120, 8, -68, 64, -30, -2, 17,
1, 120, 8, -68, 64, -30, -2, 17 }},
{{ 1, 46, 60, -67, 2, 92, 120, 123,
1, 46, 60, -67, 2, 92, 120, 123 }},
{{ 1, 92, -17, -22, 32, 117, -30, 67,
1, 92, -17, -22, 32, 117, -30, 67 }},
{{ 1, -67, 120, -73, 8, -22, -68, -70,
1, -67, 120, -73, 8, -22, -68, -70 }},
{{ 1, 123, -34, -70, 128, 67, 17, 35,
1, 123, -34, -70, 128, 67, 17, 35 }},
};
static const m256_v16 FFT128_Twiddle[] =
{
{{ 1, -118, 46, -31, 60, 116, -67, -61,
1, -118, 46, -31, 60, 116, -67, -61 }},
{{ 2, 21, 92, -62, 120, -25, 123, -122,
2, 21, 92, -62, 120, -25, 123, -122 }},
{{ 4, 42, -73, -124, -17, -50, -11, 13,
4, 42, -73, -124, -17, -50, -11, 13 }},
{{ 8, 84, 111, 9, -34, -100, -22, 26,
8, 84, 111, 9, -34, -100, -22, 26 }},
{{ 16, -89, -35, 18, -68, 57, -44, 52,
16, -89, -35, 18, -68, 57, -44, 52 }},
{{ 32, 79, -70, 36, 121, 114, -88, 104,
32, 79, -70, 36, 121, 114, -88, 104 }},
{{ 64, -99, 117, 72, -15, -29, 81, -49,
64, -99, 117, 72, -15, -29, 81, -49 }},
{{ 128, 59, -23, -113, -30, -58, -95, -98,
128, 59, -23, -113, -30, -58, -95, -98 }},
};
static const m256_v16 FFT256_Twiddle[] =
{
{{ 1, 41, -118, 45, 46, 87, -31, 14,
1, 41, -118, 45, 46, 87, -31, 14 }},
{{ 60, -110, 116, -127, -67, 80, -61, 69,
60, -110, 116, -127, -67, 80, -61, 69 }},
{{ 2, 82, 21, 90, 92, -83, -62, 28,
2, 82, 21, 90, 92, -83, -62, 28 }},
{{ 120, 37, -25, 3, 123, -97, -122, -119,
120, 37, -25, 3, 123, -97, -122, -119 }},
{{ 4, -93, 42, -77, -73, 91, -124, 56,
4, -93, 42, -77, -73, 91, -124, 56 }},
{{ -17, 74, -50, 6, -11, 63, 13, 19,
-17, 74, -50, 6, -11, 63, 13, 19 }},
{{ 8, 71, 84, 103, 111, -75, 9, 112,
8, 71, 84, 103, 111, -75, 9, 112 }},
{{ -34, -109, -100, 12, -22, 126, 26, 38,
-34, -109, -100, 12, -22, 126, 26, 38 }},
{{ 16, -115, -89, -51, -35, 107, 18, -33,
16, -115, -89, -51, -35, 107, 18, -33 }},
{{ -68, 39, 57, 24, -44, -5, 52, 76,
-68, 39, 57, 24, -44, -5, 52, 76 }},
{{ 32, 27, 79, -102, -70, -43, 36, -66,
32, 27, 79, -102, -70, -43, 36, -66 }},
{{ 121, 78, 114, 48, -88, -10, 104, -105,
121, 78, 114, 48, -88, -10, 104, -105 }},
{{ 64, 54, -99, 53, 117, -86, 72, 125,
64, 54, -99, 53, 117, -86, 72, 125 }},
{{ -15, -101, -29, 96, 81, -20, -49, 47,
-15, -101, -29, 96, 81, -20, -49, 47 }},
{{ 128, 108, 59, 106, -23, 85, -113, -7,
128, 108, 59, 106, -23, 85, -113, -7 }},
{{ -30, 55, -58, -65, -95, -40, -98, 94,
-30, 55, -58, -65, -95, -40, -98, 94 }}
};
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
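// Note: with these controls _mm256_shuffle_epi32 permutes the four
// 32-bit words of each 128-bit lane so that word j comes from word j^s
// (e.g. 0xb1 maps 0,1,2,3 -> 1,0,3,2). The PERM_* macros below build
// their XOR-indexed lane permutations out of shufxor.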
// imported from vector.c
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
_mm256_srai_epi16( x, 8 ) )
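// REDUCE is a partial reduction mod 257: writing x = 256*hi + lo with
// lo = x & 255 and hi = x >> 8 (arithmetic shift), and using
// 256 == -1 (mod 257), we get x == lo - hi (mod 257). The result fits
// easily in 16 bits but is not yet in canonical range.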
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, \
_mm256_and_si256( _mm256_set1_epi16( 257 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
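// EXTRA_REDUCE_S finishes the job: where x > 128 the cmpgt mask is
// all-ones, so the AND selects 257 and the subtract maps x into the
// balanced range around zero.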
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
#define DO_REDUCE( i ) X(i) = REDUCE( X(i) )
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE( X(i) ); \
X(i) = EXTRA_REDUCE_S( X(i) ); \
} while(0)
void fft64_2way( void *a )
{
__m256i* const A = a;
register __m256i X0, X1, X2, X3, X4, X5, X6, X7;
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) X(i) = REDUCE( X(i) )
// Begin with 8 parallel DIF FFT_8 transforms
//
// FFT_8 using w=4 as 8th root of unity
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
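// 4 is an 8th root of unity mod 257 (4^4 = 256 == -1, so 4^8 == 1).
// w[] holds exponents of 2: a left shift by w[n] multiplies by
// 2^w[n] = 4^n, with the mod-257 reduction deferred to the DO_REDUCE
// calls that follow.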
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
#define BUTTERFLY_0( i,j ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
DO_REDUCE( 1 );
BUTTERFLY_0( 0, 1 );
BUTTERFLY_0( 2, 3 );
BUTTERFLY_0( 4, 5 );
BUTTERFLY_0( 6, 7 );
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
DO_REDUCE_FULL_S( 2 );
DO_REDUCE_FULL_S( 3 );
DO_REDUCE_FULL_S( 4 );
DO_REDUCE_FULL_S( 5 );
DO_REDUCE_FULL_S( 6 );
#undef BUTTERFLY_0
#undef BUTTERFLY_N
// Multiply by twiddle factors
X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
// Transpose the FFT state with a revbin-order permutation
// on the rows and the columns.
// This puts the full FFT_64 output in natural order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)
INTERLEAVE( 1, 0 );
INTERLEAVE( 3, 2 );
INTERLEAVE( 5, 4 );
INTERLEAVE( 7, 6 );
INTERLEAVE( 2, 0 );
INTERLEAVE( 3, 1 );
INTERLEAVE( 6, 4 );
INTERLEAVE( 7, 5 );
INTERLEAVE( 4, 0 );
INTERLEAVE( 5, 1 );
INTERLEAVE( 6, 2 );
INTERLEAVE( 7, 3 );
#undef INTERLEAVE
// Finish with 8 parallel DIT FFT_8 transforms
// FFT_8 using w=4 as 8th root of unity
// Unrolled decimation in time (DIT) radix-2 NTT.
// Input data is in revbin_permuted order.
#define BUTTERFLY_0( i,j ) \
do { \
__m256i u = X(j); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
DO_REDUCE( 0 );
DO_REDUCE( 1 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
DO_REDUCE( 4 );
DO_REDUCE( 5 );
DO_REDUCE( 6 );
DO_REDUCE( 7 );
BUTTERFLY_0( 0, 1 );
BUTTERFLY_0( 2, 3 );
BUTTERFLY_0( 4, 5 );
BUTTERFLY_0( 6, 7 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
DO_REDUCE_FULL_S( 2 );
DO_REDUCE_FULL_S( 3 );
DO_REDUCE_FULL_S( 4 );
DO_REDUCE_FULL_S( 5 );
DO_REDUCE_FULL_S( 6 );
DO_REDUCE_FULL_S( 7 );
#undef BUTTERFLY_0
#undef BUTTERFLY_N
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
void fft128_2way( void *a )
{
int i;
// Temp space to help with interleaving at the end
__m256i B[8];
__m256i *A = (__m256i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
{
B[ i ] = _mm256_add_epi16( A[ i ], A[ i+8 ] );
B[ i ] = REDUCE_FULL_S( B[ i ] );
A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
}
fft64_2way( B );
fft64_2way( A+8 );
/* Transpose (i.e. interleave) */
for ( i = 0; i < 8; i++ )
{
A[ 2*i ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] );
A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] );
}
}
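// fft128_2way is one radix-2 Cooley-Tukey (DIF) step: the size-2
// butterflies split the input into an even sub-transform (B) and an
// odd one scaled by the FFT128 twiddles, each half goes through
// fft64_2way, and the final unpack interleaves the two sub-transforms
// back into natural order. REDUCE_FULL_S after the twiddle multiply
// keeps the products inside int16 range.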
void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
__m256i *X = (__m256i*)x;
__m256i *A = (__m256i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
// This allows tweaking the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT128_Twiddle[ 2*i+1 ].m256i );\
A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)
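// tw acts as an extra input to the last size-2 butterfly (added on one
// output branch, subtracted on the other), folding the high-degree
// padding term X^127 into the transform; FinalTweak adds a second term
// for the final block.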
UNPACK( 0 );
UNPACK( 1 );
UNPACK( 2 );
if ( final )
UNPACK_TWEAK( 3, FinalTweak.m256i );
else
UNPACK_TWEAK( 3, Tweak.m256i );
#undef UNPACK
#undef UNPACK_TWEAK
fft64_2way( a );
fft64_2way( a+128 );
}
void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
__m256i *X = (__m256i*)x;
__m256i *A = (__m256i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)
// This allows tweaking the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
} while(0)
UNPACK( 0 );
UNPACK( 1 );
UNPACK( 2 );
UNPACK( 3 );
UNPACK( 4 );
UNPACK( 5 );
UNPACK( 6 );
if ( final )
UNPACK_TWEAK( 7, FinalTweak.m256i );
else
UNPACK_TWEAK( 7, Tweak.m256i );
#undef UNPACK
#undef UNPACK_TWEAK
fft128_2way( a );
fft128_2way( a+256 );
}
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
register __m256i S0l, S1l, S2l, S3l;
register __m256i S0h, S1h, S2h, S3h;
__m256i *S = (__m256i*) state;
__m256i *M = (__m256i*) msg;
__m256i *W = (__m256i*) fft;
static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
S0l = _mm256_xor_si256( S[0], M[0] );
S0h = _mm256_xor_si256( S[1], M[1] );
S1l = _mm256_xor_si256( S[2], M[2] );
S1h = _mm256_xor_si256( S[3], M[3] );
S2l = _mm256_xor_si256( S[4], M[4] );
S2h = _mm256_xor_si256( S[5], M[5] );
S3l = _mm256_xor_si256( S[6], M[6] );
S3h = _mm256_xor_si256( S[7], M[7] );
#define S(i) S##i
#define F_0(B, C, D) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
#define F_1(B, C, D) \
_mm256_or_si256( _mm256_and_si256( D, C ),\
_mm256_and_si256( _mm256_or_si256( D,C ), B ) )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
// We split the round function into two halves
// so we can insert some independent computations in between
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
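// SUM7_zp is addition mod 7: PERM(z,d,a) with PERM_START == p expands
// to PERM_{(z+p) mod 7}, so redefining PERM_START between rounds walks
// the seven lane permutations in a fixed rotation.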
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
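// Each PERM_k sends word j of the 8-word state to word j^k: XORs of
// 1..3 shuffle 32-bit words within the l/h halves, XOR 4 swaps the
// halves outright, and XORs 5..7 combine a half swap with a shuffle.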
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \
a##l = mm256_rotl_32( a##l, r ); \
a##h = mm256_rotl_32( a##h, r ); \
w##l = _mm256_add_epi32( w##l, d##l ); \
w##h = _mm256_add_epi32( w##h, d##h ); \
TTl = _mm256_add_epi32( TTl, w##l ); \
TTh = _mm256_add_epi32( TTh, w##h ); \
TTl = mm256_rotl_32( TTl, s ); \
TTh = mm256_rotl_32( TTh, s ); \
PERM( z,d,a ); \
} while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
d##l = _mm256_add_epi32( d##l, TTl ); \
d##h = _mm256_add_epi32( d##h, TTh ); \
} while(0)
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
do { \
register __m256i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \
STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG( w,hh,ll,u,z ) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
} while(0)
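// MSG assembles one round's message words from the NTT output: it
// interleaves the 16-bit values of W[a] (low halves) and W[b] (high
// halves) into 32-bit words, scaling each 16-bit half by the
// inner-code constant -- 185 for the first four rounds, 233 for the
// last four, matching the round comments below.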
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
do { \
register __m256i W0l, W1l, W2l, W3l, TTl; \
register __m256i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)
// 4 rounds with code 185
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
// 4 rounds with code 233
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
// 1 round as feed-forward
#define PERM_START 4
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1
#undef STEP_2
#undef STEP
#undef ROUND
}
void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
{
m256_v16 Y[32];
uint16_t *y = (uint16_t*) Y[0].u16;
fft256_2way_msg( y, m, final );
rounds512_2way( state->A, m, y );
}
// imported from nist.c
int simd_2way_init( simd_2way_context *state, int hashbitlen )
{
__m256i *A = (__m256i*)state->A;
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
state->count = 0;
for ( int i = 0; i < 8; i++ )
A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0],
SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
return 0;
}
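// Usage sketch (illustrative only; the interleave helpers are the ones
// used by the 4way algos elsewhere in this commit). Two independent
// 64-byte inputs are packed 2x128-bit interleaved and hashed
// lane-parallel:
//    mm256_interleave_2x128( vhash, hash0, hash1, 512 );
//    simd_2way_init( &ctx, 512 );
//    simd_2way_update_close( &ctx, vhash, vhash, 512 );  // length in bits
//    mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );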
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen )
{
int bs = state->blocksize;
int current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_2way_Compress( state, data, 0 );
databitlen -= bs;
data += 2*(bs/8);
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 2*(current/8), data, 2*((databitlen+7)/8) );
state->count += databitlen;
return 0;
}
else
{
memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
state->count += len;
databitlen -= len;
data += 2*(len/8);
current = 0;
SIMD_2way_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int simd_2way_close( simd_2way_context *state, void *hashval )
{
uint64_t l;
int current = state->count & (state->blocksize - 1);
int i;
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
current = ( current+7 ) / 8;
memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) );
SIMD_2way_Compress( state, state->buffer, 0 );
}
// Input the message length as the last block
memset( state->buffer, 0, 2*(state->blocksize / 8) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
l >>= 8;
}
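// The 64-bit bit count is stored little-endian in the first 8 bytes of
// each lane's chunk of the interleaved buffer (offsets i and i+16,
// since the two 128-bit lanes alternate every 16 bytes).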
if ( state->count < 16384 )
isshort = 2;
SIMD_2way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 2*(state->hashbitlen / 8) );
return 0;
}
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen )
{
int current, i;
int bs = state->blocksize; // bits in one lane
int isshort = 1;
uint64_t l;
current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_2way_Compress( state, data, 0 );
databitlen -= bs;
data += 2*( bs/8 );
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
state->count += databitlen;
break;
}
else
{
memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
state->count += len;
databitlen -= len;
data += 2*( len/8 );
current = 0;
SIMD_2way_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
current = ( current+7 ) / 8;
memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
SIMD_2way_Compress( state, state->buffer, 0 );
}
// Input the message length as the last block
memset( state->buffer, 0, 2*( state->blocksize/8 ) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_2way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
return 0;
}
#endif

View File

@@ -0,0 +1,27 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context;
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif

View File

@@ -1,3 +1,6 @@
#if !defined(SIMD_IV_H__)
#define SIMD_IV_H__
u32 IV_224[] = {
0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
#endif

View File

@@ -1,23 +0,0 @@
#ifndef DEFS_X5_H__
#define DEFS_X5_H__
#include <emmintrin.h>
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;
typedef struct {
uint32 buffer[8]; /* Buffer to be hashed */
__m128i chainv[10]; /* Chaining values */
uint64 bitlen[2]; /* Message length in bits */
uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;
} hashState_luffa;
typedef unsigned char byte;
#endif

File diff suppressed because it is too large

View File

@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {

View File

@@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_byteswap_32(
count[0] = mm_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] );
hash[i] = mm_bswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
@@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
W[j] = mm_bswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],

View File

@@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
@@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -25,10 +25,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} c11_4way_ctx_holder;
@@ -42,10 +42,10 @@ void init_c11_4way_ctx()
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
luffa_2way_init( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
simd_2way_init( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
@@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
@@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -12,7 +12,7 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
@@ -25,7 +25,7 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
} tt8_4way_ctx_holder;
@@ -39,7 +39,7 @@ void init_tt8_4way_ctx()
skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 );
luffa_2way_init( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
};
@@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input)
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -9,7 +9,7 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
@@ -43,10 +43,10 @@ void init_tt10_4way_ctx()
skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 );
luffa_2way_init( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 );
simd_2way_init( &tt10_4way_ctx.simd, 512 );
};
void timetravel10_4way_hash(void *output, const void *input)
@@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -8,10 +8,10 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"

View File

@@ -5,17 +5,16 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -25,10 +24,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11_4way_ctx_holder;
@@ -42,10 +41,10 @@ void init_x11_4way_ctx()
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
luffa_2way_init( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
simd_2way_init( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
@@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
@@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 7 Luffa parallel 2 way 128 bit
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -10,10 +10,8 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#ifndef NO_AES_NI
@@ -21,9 +19,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -11,15 +11,12 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
typedef struct {
blake512_4way_context blake;
@@ -28,10 +25,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
@@ -45,10 +42,11 @@ void init_x11evo_4way_ctx()
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
simd_2way_init( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
@@ -142,20 +140,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
@@ -202,17 +193,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
typedef struct {
#ifdef NO_AES_NI

View File

@@ -13,10 +13,10 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
@@ -27,10 +27,10 @@ typedef struct {
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
@@ -45,10 +45,10 @@ void init_x11gost_4way_ctx()
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
luffa_2way_init( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
simd_2way_init( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
@@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
@@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input )
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
@@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );

View File

@@ -10,9 +10,9 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
@@ -46,10 +46,10 @@ void init_x13_4way_ctx()
skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 );
luffa_2way_init( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 );
simd_2way_init( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 );
hamsi512_4way_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue );
@@ -104,17 +104,13 @@ void x13_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,17 +138,13 @@ void x13_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
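// Length conventions carry over from the serial code: Luffa takes bytes
// (64), SIMD takes bits (512).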
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -168,10 +160,10 @@ void x13_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
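// Hamsi interleaving switched from 4x32 to 4x64 to match the updated
// hamsi512_4way kernel, which now consumes 64-bit interleaved lanes.
// The same change recurs in every algo below that uses Hamsi.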
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -19,9 +19,9 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -28,10 +28,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
sm3_4way_ctx_t sm3;
hamsi512_4way_context hamsi;
@@ -49,10 +49,10 @@ void init_x13sm3_4way_ctx()
skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 );
luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
simd_2way_init( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_4way_init( &x13sm3_4way_ctx.sm3 );
hamsi512_4way_init( &x13sm3_4way_ctx.hamsi );
@@ -111,17 +111,13 @@ void x13sm3_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -149,17 +145,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -190,12 +182,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 32 bit
hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 );
// Hamsi parallel 4x32x2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
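// SM3 still works on 4x32 lanes, so its output is deinterleaved to
// scalar lanes and re-interleaved 4x64 for the updated Hamsi.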
// Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -15,9 +15,9 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -9,8 +9,7 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
//#include "algo/shabal/sph_shabal.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -18,7 +17,7 @@ typedef struct {
skein512_4way_context skein;
shabal512_4way_context shabal;
hashState_echo echo;
hashState_luffa luffa;
luffa_2way_context luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_4way_ctx_holder;
@@ -27,12 +26,12 @@ poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx()
{
skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 );
init_luffa( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost );
skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 );
luffa_2way_init( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost );
}
void polytimos_4way_hash( void *output, const void *input )
@@ -67,17 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -8,7 +8,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#ifndef NO_AES_NI

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -29,10 +29,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
@@ -45,15 +45,14 @@ void init_x14_4way_ctx()
{
blake512_4way_init( &x14_4way_ctx.blake );
bmw512_4way_init( &x14_4way_ctx.bmw );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 );
luffa_2way_init( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 );
simd_2way_init( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 );
hamsi512_4way_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue );
@@ -109,17 +108,13 @@ void x14_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -147,17 +142,13 @@ void x14_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -173,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -20,9 +20,9 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -12,14 +12,13 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -31,13 +30,12 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
// sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
@@ -53,13 +51,12 @@ void init_x15_4way_ctx()
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 );
luffa_2way_init( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 );
simd_2way_init( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 );
hamsi512_4way_init( &x15_4way_ctx.hamsi );
// sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue );
shabal512_4way_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
@@ -114,17 +111,13 @@ void x15_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -152,17 +145,13 @@ void x15_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,24 +167,11 @@ void x15_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
/*
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
*/
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -21,9 +21,9 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -23,9 +23,9 @@
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/jh/sse2/jh_sse2_opt64.h"
typedef struct {

View File

@@ -19,9 +19,9 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
@@ -41,10 +41,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
@@ -68,6 +68,10 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
// uint32_t inp0[24] __attribute__ ((aligned (64)));
// uint32_t inp1[24] __attribute__ ((aligned (64)));
// uint32_t inp2[24] __attribute__ ((aligned (64)));
// uint32_t inp3[24] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx;
@@ -75,7 +79,6 @@ void x16r_4way_hash( void* output, const void* input )
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
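// First pass hashes the 80-byte block header: 640 bits per lane are
// deinterleaved from the vectorized input and in0..in3 alias the
// per-lane buffers.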
@@ -111,7 +114,7 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
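// The digest is always 512 bits regardless of input size (80 bytes on
// the first pass), so the deinterleave length is now a constant 512
// instead of size<<3. The same fix is applied to the other 4-way
// cases below.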
case BMW:
bmw512_4way_init( &ctx.bmw );
@@ -123,7 +126,7 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
@@ -149,7 +152,7 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
@@ -161,7 +164,7 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
@@ -173,21 +176,17 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)in0, size );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)in1, size );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)in2, size );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)in3, size );
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
break;
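// in0..in3 hold the current stage's input: header lanes ( size 80 ) on
// the first pass, the previous 64-byte digests afterwards. Luffa takes
// byte lengths ( size ) while the interleaves take bits ( size<<3 ).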
case CUBEHASH:
cubehashReinit( &ctx.cube );
@@ -218,18 +217,14 @@ void x16r_4way_hash( void* output, const void* input )
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
@@ -246,11 +241,11 @@ void x16r_4way_hash( void* output, const void* input )
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -271,7 +266,7 @@ void x16r_4way_hash( void* output, const void* input )
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
@@ -292,9 +287,13 @@ void x16r_4way_hash( void* output, const void* input )
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
// in0 = (void*) hash0;
// in1 = (void*) hash1;
// in2 = (void*) hash2;
// in3 = (void*) hash3;
size = 64;
}
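// After the first of the 16 stages every subsequent stage hashes the
// previous 64-byte digest.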
memcpy( output, hash0, 32 );
@@ -351,28 +350,28 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
x16r_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
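// The four per-lane tests are an unrolled form of (sketch only):
//   for ( int lane = 0; lane < 4; lane++ )
//      if ( (hash+(lane<<3))[7] <= Htarg
//         && fulltest( hash+(lane<<3), ptarget ) )
//      {
//         found[lane] = true;
//         nonces[lane] = n + lane;
//         num_found++;
//         work_set_target_ratio( work, hash+(lane<<3) );
//      }
// Each lane's 8-word digest sits at hash + 8*lane.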
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;

View File

@@ -16,9 +16,9 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -117,7 +117,7 @@ void x16r_hash( void* output, const void* input )
case GROESTL:
#ifdef NO_AES_NI
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size<<3 );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#else
init_groestl( &ctx.groestl, 64 );
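// sph_groestl512() takes its length in bytes; passing size<<3 hashed
// 8x too much data, breaking x16r on NO_AES_NI (SSE2-only) builds.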

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
@@ -54,10 +54,10 @@ void init_x17_4way_ctx()
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
@@ -114,18 +114,14 @@ void x17_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa serial
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -153,17 +149,13 @@ void x17_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,11 +170,11 @@ void x17_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// 12 Hamsi
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -21,9 +21,9 @@
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -13,9 +13,9 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
@@ -56,10 +56,10 @@ void init_xevan_4way_ctx()
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 );
luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 );
simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
@@ -127,20 +127,14 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
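// Xevan works on a 128-byte expanded block, so lengths stay
// parameterized: dataLen bytes for Luffa, dataLen<<3 bits for the
// interleaves, instead of the fixed 64/512 used by x13-x17.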
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
@@ -169,17 +163,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
@@ -192,12 +182,11 @@ void xevan_4way_hash( void *output, const void *input )
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
// Parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
// Parallel
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
@@ -278,18 +267,13 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
@@ -318,17 +302,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
@@ -342,10 +322,10 @@ void xevan_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -11,14 +11,14 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/simd/sse2/nist.h"
#include "algo/simd/nist.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include <openssl/sha.h>
#ifdef NO_AES_NI

View File

@@ -424,12 +424,17 @@ int64_t yescryptr16_get_max64()
return 0xfffLL;
}
bool register_yescrypt_algo( algo_gate_t* gate )
void yescrypt_gate_base(algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
}
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = true;
YESCRYPT_N = 2048;
@@ -440,10 +445,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false;
YESCRYPT_N = 2048;
@@ -454,10 +456,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
client_key_hack = false;
YESCRYPT_N = 4096;
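
The three registrations now differ only in their parameters. A
hypothetical further variant (name, N value and return convention
invented for illustration) would reduce to:

bool register_yescryptr32_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );
   gate->get_max64 = (void*)&yescrypt_get_max64; // or a variant-specific max64
   client_key_hack = false;
   YESCRYPT_N = 4096;   // illustrative value
   return true;         // assumed convention for register_*_algo
}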

avxdefs.h (1313 changes)

File diff suppressed because it is too large

View File

@@ -3,16 +3,6 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
strip -s cpuminer
mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
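# The separate -DFOUR_WAY pass is gone; 4-way code paths are built into
# the regular AVX2 binary.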

View File

@@ -18,8 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4

configure (vendored, 20 changes)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.0.1'
PACKAGE_STRING='cpuminer-opt 3.8.0.1'
PACKAGE_VERSION='3.8.1'
PACKAGE_STRING='cpuminer-opt 3.8.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.0.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.0.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.1:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.0.1
cpuminer-opt configure 3.8.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.0.1, which was
It was created by cpuminer-opt $as_me 3.8.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.0.1'
VERSION='3.8.1'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.0.1, which was
This file was extended by cpuminer-opt $as_me 3.8.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.0.1
cpuminer-opt config.status 3.8.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.0.1])
AC_INIT([cpuminer-opt], [3.8.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -3238,10 +3238,10 @@ int main(int argc, char *argv[])
}
}
//#ifdef HAVE_SYSLOG_H
// if (use_syslog)
// openlog("cpuminer", LOG_PID, LOG_USER);
//#endif
#ifdef HAVE_SYSLOG_H
if (use_syslog)
openlog("cpuminer", LOG_PID, LOG_USER);
#endif
work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart));
if (!work_restart)

miner.h (10 changes)
View File

@@ -80,10 +80,10 @@ void *alloca (size_t);
# endif
//#endif
//#ifdef HAVE_SYSLOG_H
//#include <syslog.h>
//#define LOG_BLUE 0x10 /* unique value */
//#else
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#define LOG_BLUE 0x10 /* unique value */
#else
enum {
LOG_ERR,
LOG_WARNING,
@@ -93,7 +93,7 @@ enum {
/* custom notices */
LOG_BLUE = 0x10,
};
//#endif
#endif
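// With HAVE_SYSLOG_H the system header supplies the LOG_* levels and
// only LOG_BLUE is defined locally; the enum above remains the
// fallback for platforms without syslog.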
static inline bool is_windows(void)
{